Skip to content

Instantly share code, notes, and snippets.

@cognitivetech
Created June 10, 2024 10:09
Show Gist options
  • Save cognitivetech/7fa7cc5bf9e852dacca6b4aae3c5c73d to your computer and use it in GitHub Desktop.
Save cognitivetech/7fa7cc5bf9e852dacca6b4aae3c5c73d to your computer and use it in GitHub Desktop.
# https://huggingface.co/spaces/Ailyth/Multi-voice-TTS-GPT-SoVITS
# Clone and follow install instructions in linked colab ^^^^
# then use this script based on that code
#
# python cli.Multi-voice-TTS-GPT-SoVITS.py --audio {input.wav} --text sample.md --language English
#
# For sample.md, divide the sections with `##` (h2) headers and put each paragraph on a single line.
# The output is written to `out/<filename>/<h2-name>-NN.wav`, so each input file gets its own folder and each heading its own output prefix.
#
# See sample input here in gist
import argparse
import numpy as np
import soundfile as sf
import torch
import librosa
from transformers import pipeline, AutoModelForMaskedLM, AutoTokenizer
from datetime import datetime
from time import time as ttime
from timeit import default_timer as timer
from polyglot.detect import Detector
from feature_extractor import cnhubert
from module.models import SynthesizerTrn
from module.mel_processing import spectrogram_torch
from AR.models.t2s_lightning_module import Text2SemanticLightningModule
from my_utils import load_audio
import os, re, sys, pytz, random
import os,re,sys,LangSegment,librosa,pdb,torch,pytz,random
from text.cleaner import clean_text
from text import cleaned_text_to_sequence
# Copy `_CUDA_VISIBLE_DEVICES` into `CUDA_VISIBLE_DEVICES` when the former is set.
if "_CUDA_VISIBLE_DEVICES" in os.environ:
    os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
# Timezone used for the timestamps printed by tprint().
tz = pytz.timezone('Asia/Singapore')
device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision defaults to on when CUDA is available; the `is_half`
# environment variable overrides that default. The value is compared as text
# ("true" or "1", case-insensitive, enable it) instead of being passed to
# eval(), so the environment can no longer inject arbitrary code here.
is_half = os.environ.get(
    "is_half", "True" if torch.cuda.is_available() else "False"
).strip().lower() in {"true", "1"}
# Characters treated as sentence boundaries when trimming and splitting text.
# NOTE(review): several entries are duplicates as written; they may have been
# full-width punctuation in the upstream project - confirm before relying on
# this set for Chinese or Japanese input.
splits = {"?", "!", ".", "?", "!", ":", ":", "—", "…", }
# Prefer a local Whisper checkpoint; otherwise fall back to the model id below.
whisper_path = os.environ.get("whisper_path", "pretrained_models/whisper-tiny")
if not os.path.exists(whisper_path):
    whisper_path = "openai/whisper-tiny"
def get_first(text):
    """Return the stripped part of ``text`` that precedes the first character found in ``splits``."""
    escaped_separators = [re.escape(sep) for sep in splits]
    separator_class = "[" + "".join(escaped_separators) + "]"
    leading_piece = re.split(separator_class, text)[0]
    return leading_piece.strip()
# Model locations: an environment variable wins, otherwise a local default is used.
cnhubert_base_path = os.environ.get("cnhubert_base_path", "pretrained_models/chinese-hubert-base")
bert_path = os.environ.get("bert_path", "pretrained_models/chinese-roberta-wwm-ext-large")
# When the local path does not exist, switch to the identifiers below
# (presumably Hugging Face hub model ids - confirm).
if not os.path.exists(cnhubert_base_path):
    cnhubert_base_path = "TencentGameMate/chinese-hubert-base"
if not os.path.exists(bert_path):
    bert_path = "hfl/chinese-roberta-wwm-ext-large"
# Set on the cnhubert module before get_model() is called
# (presumably get_model() reads it - confirm against feature_extractor).
cnhubert.cnhubert_base_path = cnhubert_base_path
# NOTE(review): tz is already assigned earlier in the file; this repeats it.
tz = pytz.timezone('Asia/Singapore')
ssl_model = cnhubert.get_model()
# Move the model to the selected device, in half precision when is_half is set.
if is_half == True:
    ssl_model = ssl_model.half().to(device)
else:
    ssl_model = ssl_model.to(device)
def abs_path(dir):
    """Join ``dir`` onto the directory that contains the launched script (``sys.argv[0]``)."""
    script_location = os.path.abspath(sys.argv[0])
    script_folder = os.path.dirname(script_location)
    resolved = os.path.join(script_folder, dir)
    return resolved
# Default checkpoints, resolved relative to the script's own directory.
gpt_path = abs_path("MODELS/22/22.ckpt")
sovits_path = abs_path("MODELS/22/22.pth")
# cnhubert_base_path and bert_path are intentionally not read from the
# environment again here: both are already resolved earlier in this file, and
# repeating the lookup overwrote the fallback identifiers selected there when
# the local directories are missing.
def change_gpt_weights(gpt_path):
    """Load the GPT (text-to-semantic) checkpoint at ``gpt_path`` into module globals.

    Sets the globals ``hz``, ``max_sec``, ``t2s_model`` and ``config``, prints
    the parameter count, and records the loaded path in ``./gweight.txt``.
    """
    global hz, max_sec, t2s_model, config
    hz = 50
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    checkpoint = torch.load(gpt_path, map_location="cpu")
    config = checkpoint["config"]
    max_sec = config["data"]["max_sec"]
    t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
    t2s_model.load_state_dict(checkpoint["weight"])
    if is_half == True:
        t2s_model = t2s_model.half()
    t2s_model = t2s_model.to(device)
    t2s_model.eval()
    parameter_count = 0
    for tensor in t2s_model.parameters():
        parameter_count += tensor.nelement()
    print("Number of parameter: %.2fM" % (parameter_count / 1e6))
    with open("./gweight.txt", "w", encoding="utf-8") as marker:
        marker.write(gpt_path)
# Load the default GPT checkpoint at import time; this populates the module
# globals hz, max_sec, t2s_model and config.
change_gpt_weights(gpt_path)
# Maps the language labels accepted by this script to internal language codes.
dict_language = {
    "中文1": "all_zh",  # treat the whole text as Chinese
    "English": "en",  # treat the whole text as English (unchanged)
    "日文1": "all_ja",  # treat the whole text as Japanese
    "中文": "zh",  # mixed Chinese/English recognition (unchanged)
    "日本語": "ja",  # mixed Japanese/English recognition (unchanged)
    "混合": "auto",  # multilingual: segment the text and detect each language
}
class DictToAttrRecursive(dict):
    """A dict whose keys are also readable and writable as attributes.

    Nested dicts are wrapped recursively, so ``cfg.data.hop_length`` and
    ``cfg["data"]["hop_length"]`` refer to the same value. Every value is kept
    both as a mapping entry and as an instance attribute.
    """

    def __init__(self, input_dict):
        super().__init__(input_dict)
        for key, value in input_dict.items():
            # __setattr__ wraps nested dicts and stores the value as both a
            # mapping entry and an instance attribute.
            setattr(self, key, value)

    def __getattr__(self, item):
        # Only reached when normal attribute lookup fails.
        try:
            return self[item]
        except KeyError:
            raise AttributeError(f"Attribute {item} not found") from None

    def __setattr__(self, key, value):
        if isinstance(value, dict):
            value = DictToAttrRecursive(value)
        super().__setitem__(key, value)
        super().__setattr__(key, value)

    def __delattr__(self, item):
        try:
            del self[item]
        except KeyError:
            raise AttributeError(f"Attribute {item} not found") from None
        # Also drop the copy stored as an instance attribute; otherwise the
        # value stays readable through attribute access after deletion.
        self.__dict__.pop(item, None)
def change_sovits_weights(sovits_path):
    """Load the SoVITS checkpoint at ``sovits_path`` into module globals.

    Sets the globals ``vq_model`` and ``hps``, prints the result of loading
    the state dict, and records the loaded path in ``./sweight.txt``.
    """
    global vq_model, hps
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    checkpoint = torch.load(sovits_path, map_location="cpu")
    hps = DictToAttrRecursive(checkpoint["config"])
    hps.model.semantic_frame_rate = "25hz"
    first_positional = hps.data.filter_length // 2 + 1
    second_positional = hps.train.segment_size // hps.data.hop_length
    vq_model = SynthesizerTrn(
        first_positional,
        second_positional,
        n_speakers=hps.data.n_speakers,
        **hps.model
    )
    # enc_q is removed unless the path contains "pretrained".
    if "pretrained" not in sovits_path:
        del vq_model.enc_q
    vq_model = vq_model.half().to(device) if is_half == True else vq_model.to(device)
    vq_model.eval()
    load_report = vq_model.load_state_dict(checkpoint["weight"], strict=False)
    print(load_report)
    with open("./sweight.txt", "w", encoding="utf-8") as marker:
        marker.write(sovits_path)
# Load the default SoVITS checkpoint at import time; this populates the module
# globals vq_model and hps.
change_sovits_weights(sovits_path)
# Speech-recognition pipeline called by transcribe() on the reference audio.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=whisper_path,
    chunk_length_s=30,
    device=device,)
def duration(audio_file_path):
    """Placeholder check on the reference audio: accepts every input.

    ``audio_file_path`` is ignored and the result is always ``True``.
    """
    is_acceptable = True
    return is_acceptable
def _find_cut_index(units, limit, search_limit, is_boundary):
    """Return the index of the unit to cut after, or None when no boundary exists.

    Scans backwards from ``limit`` first, then forwards up to ``search_limit``.
    The caller guarantees ``len(units) > limit``.
    """
    for index in range(limit, -1, -1):
        if is_boundary(units[index]):
            return index
    for index in range(limit + 1, min(len(units), search_limit)):
        if is_boundary(units[index]):
            return index
    return None


def trim_text(text, language):
    """Shorten ``text`` to a length the synthesizer is given in one request.

    English text is limited to about 200 words and any other language to about
    120 characters. The cut is placed just after a unit containing a character
    from ``splits``, searching backwards from the limit and then up to 30 units
    past it; when no such character is found the text is cut at the limit.

    Args:
        text: Text to shorten; newlines are removed.
        language: Language label; only ``'English'`` selects word-based trimming.

    Returns:
        The trimmed text, or the stripped text when it is already within the limit.
    """
    limit_cj = 120  # characters
    limit_en = 200  # words
    if language == 'English':
        # Replace newlines with a space so the words on either side of a line
        # break are not fused into one token.
        text = text.replace('\n', ' ').strip()
        words = text.split()
        if len(words) <= limit_en:
            return text
        cut = _find_cut_index(
            words, limit_en, limit_en + 30,
            lambda word: any(punct in word for punct in splits),
        )
        kept = words[:limit_en] if cut is None else words[:cut + 1]
        return ' '.join(kept)
    # Chinese / Japanese: trimming is character based.
    text = text.replace('\n', '').strip()
    if len(text) <= limit_cj:
        return text
    cut = _find_cut_index(text, limit_cj, limit_cj + 30, lambda char: char in splits)
    return text[:limit_cj] if cut is None else text[:cut + 1]
def transcribe(voice):
    """Transcribe the reference audio and report its language.

    Args:
        voice: Audio input handed to the speech-recognition pipeline ``pipe``.

    Returns:
        A ``(text, language)`` tuple, where ``language`` is one of the labels
        ``'English'``, ``'中文'`` or ``'日本語'``.

    Raises:
        ValueError: If ``voice`` is None.
        UnboundLocalError: If the detected language is not one of the three
            supported ones. The exception type is kept because clone_voice()
            catches exactly this type.
    """
    started = timer()
    tprint('⚡Start Clone - transcribe')
    if voice is None:
        message = "No audio file submitted! Please upload or record an audio file before submitting your request."
        wprint(message)
        # Stop here instead of handing None to the pipeline.
        raise ValueError(message)
    result = pipe(
        voice,
        batch_size=8,
        generate_kwargs={"task": "transcribe"},
        return_timestamps=True,
        return_language=True,
    )
    text = result['text']
    detected = result['chunks'][0]['language']
    labels = {'english': 'English', 'chinese': '中文', 'japanese': '日本語'}
    if detected not in labels:
        raise UnboundLocalError(f"unsupported transcription language: {detected!r}")
    language = labels[detected]
    finished = timer()
    tprint(f'transcribe COMPLETE,{round(finished-started,4)}s')
    tprint(f'\nTRANSCRIBE RESULT:\n 🔣Language:{language} \n 🔣Text:{text}' )
    return text, language
def tprint(text):
    """Print ``text`` prefixed with the current time in the module timezone ``tz``."""
    now = datetime.now(tz).strftime('%H:%M:%S')
    print(f'UTC+8 - {now} - {text}')


def wprint(text):
    """Print a warning message through tprint().

    Several functions in this file report problems through wprint(), but the
    file contained no definition of it, so those paths raised NameError instead
    of showing the message.
    """
    tprint(f'WARNING: {text}')
def get_cleaned_text_final(text, language):
    """Convert ``text`` to phonemes with the cleaner that matches ``language``.

    Args:
        text: Text to clean.
        language: Internal language code (a value of ``dict_language``).

    Returns:
        The ``(phones, word2ph, norm_text)`` tuple produced by the cleaner.

    Raises:
        ValueError: If ``language`` is not a supported code. Previously this
            case ended in an unbound-local error at the return statement.
    """
    if language in {"en", "all_zh", "all_ja"}:
        phones, word2ph, norm_text = clean_text_inf(text, language)
    elif language in {"zh", "ja", "auto"}:
        phones, word2ph, norm_text = nonen_clean_text_inf(text, language)
    else:
        raise ValueError(f"Unsupported language code: {language!r}")
    return phones, word2ph, norm_text
def get_bert_final(phones, word2ph, text, language, device):
    """Return the BERT feature tensor for ``text`` according to ``language``.

    ``"en"`` goes through get_bert_inf(), ``"zh"``/``"ja"``/``"auto"`` through
    nonen_get_bert_inf(), ``"all_zh"`` through get_bert_feature(); any other
    code yields a zero tensor of shape ``(1024, len(phones))`` on ``device``.
    """
    if language == "en":
        return get_bert_inf(phones, word2ph, text, language)
    if language in {"zh", "ja", "auto"}:
        return nonen_get_bert_inf(text, language)
    if language == "all_zh":
        return get_bert_feature(text, word2ph).to(device)
    placeholder = torch.zeros((1024, len(phones)))
    return placeholder.to(device)
def get_spepc(hps, filename):
    """Load ``filename`` at ``hps.data.sampling_rate`` and return its spectrogram.

    The audio is given a leading dimension of size one before it is passed to
    spectrogram_torch() together with the filter, hop and window lengths from
    ``hps.data``.
    """
    data_cfg = hps.data
    samples = load_audio(filename, int(data_cfg.sampling_rate))
    batched = torch.FloatTensor(samples).unsqueeze(0)
    return spectrogram_torch(
        batched,
        data_cfg.filter_length,
        data_cfg.sampling_rate,
        data_cfg.hop_length,
        data_cfg.win_length,
        center=False,
    )
# Tensor dtype the BERT features are cast to in get_tts_wav(): float16 when
# is_half is set, float32 otherwise.
dtype=torch.float16 if is_half == True else torch.float32
def get_bert_inf(phones, word2ph, norm_text, language):
    """Return BERT features for Chinese text, or a zero tensor for any other language.

    An ``"all_"`` prefix on ``language`` is ignored. The zero tensor has shape
    ``(1024, len(phones))`` and is float16 when ``is_half`` is set, float32
    otherwise. The result is placed on the module-level ``device``.
    """
    base_language = language.replace("all_", "")
    if base_language != "zh":
        zeros_dtype = torch.float16 if is_half == True else torch.float32
        placeholder = torch.zeros((1024, len(phones)), dtype=zeros_dtype)
        return placeholder.to(device)
    return get_bert_feature(norm_text, word2ph).to(device)  # .to(dtype)
def merge_short_text_in_array(texts, threshold):
    """Concatenate consecutive strings until each merged piece reaches ``threshold`` characters.

    Args:
        texts: List of strings, in reading order.
        threshold: Minimum length a merged piece must reach before it is emitted.

    Returns:
        ``texts`` itself when it has fewer than two items; otherwise a new list
        of merged strings. A leftover that never reaches the threshold is
        appended to the last emitted piece, or returned alone when nothing was
        emitted.
    """
    if len(texts) < 2:
        return texts
    merged = []
    pending = []  # pieces collected since the last emitted string
    pending_len = 0
    for piece in texts:
        pending.append(piece)
        pending_len += len(piece)
        if pending_len >= threshold:
            merged.append("".join(pending))
            pending = []
            pending_len = 0
    if pending_len > 0:
        leftover = "".join(pending)
        if merged:
            merged[-1] += leftover
        else:
            merged.append(leftover)
    return merged
def clean_text_inf(text, language):
    """Keep the segments of ``text`` written in ``language`` and convert them to phonemes.

    Args:
        text: Input text, possibly mixing several languages.
        language: Internal language code; an ``"all_"`` prefix is ignored.

    Returns:
        ``(phones, word2ph, norm_text)`` where ``phones`` is the sequence
        returned by cleaned_text_to_sequence().
    """
    language = language.replace("all_", "")
    # For Japanese, segments detected as "zh" are kept as well.
    accepted = {language, "zh"} if language == "ja" else {language}
    kept = [
        segment["text"] + " "
        for segment in LangSegment.getTexts(text)
        if segment["lang"] in accepted
    ]
    formattext = "".join(kept)
    # Collapse runs of spaces to a single space. The loop must test for TWO
    # consecutive spaces: testing for one space while replacing it with one
    # space never changes the string and never terminates.
    double_space = " " * 2
    while double_space in formattext:
        formattext = formattext.replace(double_space, " ")
    phones, word2ph, norm_text = clean_text(formattext, language)
    phones = cleaned_text_to_sequence(phones)
    return phones, word2ph, norm_text
def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, how_to_cut=("Do not split"), volume_scale=1.0):
    """Synthesize ``text`` in the voice of a reference recording and write it to a WAV file.

    Args:
        ref_wav_path: Path of the reference recording; it must be 3-10 seconds
            long once loaded at 16 kHz.
        prompt_text: Transcript of the reference recording.
        prompt_language: Language label of ``prompt_text`` (a key of ``dict_language``).
        text: Text to synthesize; it is shortened by trim_text() first.
        text_language: Language label of ``text`` (a key of ``dict_language``).
        how_to_cut: Name of the sentence-splitting strategy; the default
            leaves the text unsplit.
        volume_scale: Gain applied to the generated samples.

    Returns:
        ``"output_audio.wav"`` on success. ``None`` when the input is rejected
        (empty text, unknown language label, nothing to synthesize) or when
        inference raises RuntimeError.

    Raises:
        OSError: If the reference recording is outside the 3-10 second range.
    """
    if not duration(ref_wav_path):
        return None
    if text == '':
        wprint("Please enter text to generate/请输入生成文字")
        return None
    t0 = ttime()
    startTime = timer()
    # trim_text() expects the language label, so call it before the label is
    # mapped to an internal code.
    text = trim_text(text, text_language)
    # Validate both labels before the expensive checkpoint reload.
    try:
        prompt_language = dict_language[prompt_language]
        text_language = dict_language[text_language]
    except KeyError as e:
        wprint(f"Unsupported language type: {e}")
        return None
    # NOTE(review): both checkpoints are reloaded on every call because
    # clone_voice() may have changed the module-level paths; caching the last
    # loaded paths would avoid repeating this for every paragraph.
    change_sovits_weights(sovits_path)
    tprint(f'🏕️LOADED SoVITS Model: {sovits_path}')
    change_gpt_weights(gpt_path)
    tprint(f'🏕️LOADED GPT Model: {gpt_path}')
    prompt_text = prompt_text.strip("\n")
    # Guard against an empty transcript before looking at its last character.
    if not prompt_text or prompt_text[-1] not in splits:
        prompt_text += "。" if prompt_language != "en" else "."
    text = text.strip("\n")
    if not text:
        # Whitespace-only input ends up empty after trimming.
        wprint("Please enter text to generate/请输入生成文字")
        return None
    # Prefix a sentence mark when the first clause is very short.
    if text[0] not in splits and len(get_first(text)) < 4:
        text = "。" + text if text_language != "en" else "." + text
    # 0.3 seconds of silence, appended to the reference and after every sentence.
    zero_wav = np.zeros(
        int(hps.data.sampling_rate * 0.3),
        dtype=np.float16 if is_half == True else np.float32,
    )
    with torch.no_grad():
        wav16k, sr = librosa.load(ref_wav_path, sr=16000)
        if wav16k.shape[0] > 160000 or wav16k.shape[0] < 48000:
            errinfo = '参考音频在3~10秒范围外,请更换!'
            raise OSError((errinfo))
        wav16k = torch.from_numpy(wav16k)
        zero_wav_torch = torch.from_numpy(zero_wav)
        if is_half == True:
            wav16k = wav16k.half().to(device)
            zero_wav_torch = zero_wav_torch.half().to(device)
        else:
            wav16k = wav16k.to(device)
            zero_wav_torch = zero_wav_torch.to(device)
        wav16k = torch.cat([wav16k, zero_wav_torch])
        ssl_content = ssl_model.model(wav16k.unsqueeze(0))[
            "last_hidden_state"
        ].transpose(
            1, 2
        )  # .float()
        codes = vq_model.extract_latent(ssl_content)
        prompt_semantic = codes[0, 0]
    t1 = ttime()
    phones1, word2ph1, norm_text1 = get_cleaned_text_final(prompt_text, prompt_language)
    # NOTE(review): cut1..cut5 are not defined in this file as shown; any value
    # other than the default needs them to be provided elsewhere.
    if how_to_cut == "Split into groups of 4 sentences":
        text = cut1(text)
    elif how_to_cut == "Split every 50 characters":
        text = cut2(text)
    elif how_to_cut == "Split at CN/JP periods (。)":
        text = cut3(text)
    elif how_to_cut == "Split at English periods (.)":
        text = cut4(text)
    elif how_to_cut == "Split at punctuation marks":
        text = cut5(text)
    while "\n\n" in text:
        text = text.replace("\n\n", "\n")
    print(f"🧨实际输入的目标文本(切句后):{text}\n")
    texts = text.split("\n")
    texts = merge_short_text_in_array(texts, 5)
    audio_opt = []
    bert1 = get_bert_final(phones1, word2ph1, norm_text1, prompt_language, device).to(dtype)
    # The reference spectrogram is identical for every sentence, so compute it once.
    refer = get_spepc(hps, ref_wav_path)  # .to(device)
    if is_half == True:
        refer = refer.half().to(device)
    else:
        refer = refer.to(device)
    prompt = prompt_semantic.unsqueeze(0).to(device)
    for sentence in texts:
        # Skip blank lines in the target text.
        if len(sentence.strip()) == 0:
            continue
        if sentence[-1] not in splits:
            sentence += "。" if text_language != "en" else "."
        print(("\n🎈实际输入的目标文本(每句):"), sentence)
        phones2, word2ph2, norm_text2 = get_cleaned_text_final(sentence, text_language)
        try:
            bert2 = get_bert_final(phones2, word2ph2, norm_text2, text_language, device).to(dtype)
        except RuntimeError as e:
            wprint(f"The input text does not match the language/输入文本与语言不匹配: {e}")
            return None
        bert = torch.cat([bert1, bert2], 1)
        all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
        bert = bert.to(device).unsqueeze(0)
        all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device)
        t2 = ttime()
        with torch.no_grad():
            pred_semantic, idx = t2s_model.model.infer_panel(
                all_phoneme_ids,
                all_phoneme_len,
                prompt,
                bert,
                # prompt_phone_len=ph_offset,
                top_k=config["inference"]["top_k"],
                early_stop_num=hz * max_sec,
            )
        t3 = ttime()
        # Keep only the last idx generated tokens.
        pred_semantic = pred_semantic[:, -idx:].unsqueeze(
            0
        )  # .unsqueeze(0)  # mq needs one more unsqueeze
        try:
            # Decode with the target phonemes only (the prompt part is left out).
            audio = (
                vq_model.decode(
                    pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refer
                )
                .detach()
                .cpu()
                .numpy()[0, 0]
            )
        except RuntimeError as e:
            wprint(f"The input text does not match the language/输入文本与语言不匹配: {e}")
            return None
        # Normalize when the peak exceeds full scale.
        max_audio = np.abs(audio).max()
        if max_audio > 1:
            audio /= max_audio
        audio_opt.append(audio)
        audio_opt.append(zero_wav)
        t4 = ttime()
    if not audio_opt:
        # Every sentence was blank: nothing to concatenate and no timings to report.
        wprint("No audio was generated from the input text.")
        return None
    # Timings of the last synthesized sentence.
    print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
    # Scale to the 16-bit range and clip, so samples at or above full scale (or
    # a volume_scale above 1) saturate instead of wrapping around.
    samples = np.concatenate(audio_opt, 0).astype(np.float32) * 32768 * volume_scale
    audio_data = np.clip(samples, -32768, 32767).astype(np.int16)
    output_wav = "output_audio.wav"
    sf.write(output_wav, audio_data, hps.data.sampling_rate)
    endTime = timer()
    tprint(f'🆗TTS COMPLETE,{round(endTime-startTime,4)}s')
    return output_wav
def clone_voice(user_voice, user_text, user_lang):
    """Speak ``user_text`` in the voice of the recording ``user_voice``.

    Points the module-level checkpoint paths at the pretrained models,
    transcribes the recording to obtain the prompt text and language, and
    calls get_tts_wav() without sentence splitting.

    Returns:
        The path returned by get_tts_wav(), or ``None`` when the text is empty
        or the language of the recording is not recognized.
    """
    global gpt_path, sovits_path
    if not duration(user_voice):
        return None
    if user_text == '':
        wprint("Please enter text to generate/请输入生成文字")
        return None
    user_text = trim_text(user_text, user_lang)
    started = timer()
    gpt_path = abs_path("pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt")
    sovits_path = abs_path("pretrained_models/s2G488k.pth")
    try:
        transcript = transcribe(user_voice)
    except UnboundLocalError as e:
        wprint(f"The language in the audio cannot be recognized :{str(e)}")
        return None
    prompt_text, prompt_language = transcript
    result_path = get_tts_wav(
        user_voice,
        prompt_text,
        prompt_language,
        user_text,
        user_lang,
        how_to_cut="Do not split",
        volume_scale=1.0,
    )
    finished = timer()
    tprint(f'🆗CLONE COMPLETE,{round(finished-started,4)}s')
    return result_path
def process_text_file(file_path, user_voice, user_lang):
    """Synthesize every paragraph of a markdown-style text file, one WAV per line.

    Lines starting with ``##`` open a new section; each following non-blank
    line is synthesized with clone_voice() and saved as
    ``out/<file name without extension>/<heading>-NN.wav``. Lines that appear
    before the first heading are skipped.

    Args:
        file_path: Path of the UTF-8 text file to read.
        user_voice: Path of the reference recording.
        user_lang: Language label of the text.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.read().splitlines()
    # Each input file gets its own output folder, named after the file.
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    output_dir = os.path.join('out', base_name)
    os.makedirs(output_dir, exist_ok=True)
    current_heading = None
    file_counter = 0
    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            # Blank lines carry no text to synthesize.
            continue
        if line.startswith('##'):
            # Strip every leading '#', so deeper headings do not leak '#' into
            # the file name, and keep path separators out of it.
            heading = line.lstrip('#').strip().lower().replace(' ', '-')
            current_heading = re.sub(r'[\\/]+', '-', heading)
            file_counter = 0
        elif current_heading:
            output_wav = clone_voice(user_voice, line, user_lang)
            if output_wav:
                output_filename = f"{output_dir}/{current_heading}-{file_counter:02d}.wav"
                # os.replace overwrites a file left over from an earlier run on
                # every platform.
                os.replace(output_wav, output_filename)
                print(f"Generated audio saved to: {output_filename}")
                file_counter += 1
def main():
    """Parse the command line and synthesize every section of the given text file."""
    cli = argparse.ArgumentParser(description="Clone custom voice from terminal")
    cli.add_argument("--audio", type=str, required=True, help="Path to the source audio file")
    cli.add_argument("--text", type=str, required=True, help="Path to the text file to read")
    cli.add_argument("--language", type=str, default="English", help="Language of the text (default: English)")
    options = cli.parse_args()
    process_text_file(options.text, options.audio, options.language)
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()

A Tranquil Himalayan Forest Visualization

"Imagine yourself standing amidst a breathtaking panoramic view of the snow-capped Himalayan peaks, their jagged silhouettes piercing the sky with imposing grandeur. Take in the vast, sweeping vistas of the mountain range, with its undulating ridges and deep valleys carved by ancient glaciers. Notice the play of light and shadow on the rugged terrain, creating a constantly shifting tapestry of colors and textures. Let your gaze wander to the dense, verdant forests that blanket the lower slopes of the mountains, with their rich tapestry of greens ranging from emerald to deep olive hues." "As you step into the forest, take a moment to appreciate the towering trees surrounding you. Their trunks are adorned with intricate patterns of bark, and their branches reach skyward, filtering the sunlight into a dappled dance on the forest floor. Notice the vibrant undergrowth, with ferns unfurling their delicate fronds and mosses carpeting the ground in a lush, velvety embrace. Run your hands along the rough, ridged texture of the bark, feeling the grooves and irregularities beneath your fingertips. Sink your bare feet into the soft, spongy texture of the moss-covered forest floor, relishing the lush, velvety carpet. Find yourself drawn to a secluded clearing within the forest, a tranquil oasis surrounded by towering trees and the gentle whispers of the wind through the leaves." "In the clearing, feel the soft, verdant grass beneath your feet, dotted with wildflowers in a kaleidoscope of colors, from vibrant reds and oranges to delicate purples and whites. Bask in the warm, inviting ambiance created by the dappled sunlight filtering through the canopy. Notice the crystal-clear streams and rivers that meander through the forest, their waters reflecting the surrounding beauty like a mirror. Take a moment to appreciate the gentle cascades and waterfalls, their waters tumbling over moss-covered rocks and creating a soothing, rhythmic melody. 
Dip your hands into the cool, silky water, feeling the gentle ripples and eddies created by the waters movement." "As you explore the forest, keep an eye out for the diverse wildlife that calls this place home. You might spot graceful deer grazing in the clearings, colorful birds flitting among the branches, or majestic eagles soaring high above the treetops. Take note of the ancient, gnarled trees with twisted trunks and sprawling branches, or the towering rock formations adorned with vibrant lichens and mosses. Run your hands along the smooth, cool surface of the rocks, feeling the subtle variations in texture and temperature. Allow yourself to be captivated by the ever-changing tapestry of nature, with each season bringing new sights and wonders to behold." "Tune into the gentle rustling of leaves as a soft breeze passes through the forest canopy, creating a soothing, rhythmic whisper. Listen for the melodic songs of various birds, each with its unique call, creating a harmonious symphony of natures music. Notice the sound of small woodland creatures scurrying through the undergrowth, their footsteps creating a gentle patter on the forest floor. Let the soothing sounds of a nearby stream or river wash over you, with its waters gently flowing over rocks and creating a calming, rippling melody. In the distance, hear the thunderous roar of a waterfall, its cascading waters echoing through the forest. Embrace the moments of profound silence, punctuated only by the occasional sounds of nature, creating a sense of peace and serenity." "Breathe in the rich, earthy aroma of the forest floor, a blend of decaying leaves, moss, and damp soil, creating a grounding and natural scent. Notice the distinct, woody fragrance of the towering trees, with their bark and sap releasing a warm, comforting scent. Delight in the delicate, sweet fragrance of wildflowers blooming in the clearings and along the forest paths, creating a refreshing aroma. 
Fill your lungs with the crisp, invigorating scent of mountain air, carrying hints of pine and the coolness of nearby glaciers or snowcapped peaks. Catch the subtle, smoky aroma of a distant campfire or the scent of burning wood from a nearby village, adding a cozy and nostalgic element to the forests scent profile." "Cup your hands and take a sip of the crisp, refreshing mountain water flowing from streams and rivers, untainted by pollution or impurities. Savor the subtle mineral taste that invigorates your senses, a stark contrast to the often treated or bottled water you may be accustomed to. Pluck a ripe, wild berry from a nearby bush and relish the burst of sweet-tart flavor, with a hint of tanginess that dances on your tongue. Gather a few edible leaves and herbs, gently chewing on them to experience the earthy, slightly bitter, yet invigorating flavors. Drizzle a bit of fresh, wildflower honey onto your tongue, savoring its rich, floral sweetness and velvety texture." "As you wander through the forest, notice the ancient Buddhist monasteries or Hindu temples nestled among the trees or perched on mountainsides, their intricate architecture and vibrant colors standing in contrast to the natural surroundings. Listen for the distant chanting or the ringing of temple bells, carried on the mountain breeze, creating a sense of reverence and spiritual connection. Watch the prayer flags fluttering in the wind, their vibrant colors and inscribed mantras symbolizing the spread of peace and compassion. Feel the presence of revered spiritual figures, such as Buddhist monks or Hindu sadhus, meditating or walking through the forest, their serene demeanor and simple robes embodying the spiritual traditions of the region. Encounter intricately carved prayer wheels or stupas adorned with colorful offerings, representing the spiritual practices and beliefs of the region." 
"Immerse yourself in the sacred natural elements of the Himalayan region, such as the purifying waters of the rushing mountain streams or the ancient, gnarled trees adorned with prayer flags and offerings. Imagine the sound of chanting or the rhythmic beating of drums accompanying spiritual rituals, evoking a deep sense of connection to the regions rich spiritual heritage. Allow yourself to be transported by the mythical tales and legends of the Himalayas, where enlightened beings and sacred sites are woven into the fabric of the landscape." "As you fully embrace this tranquil Himalayan forest, experience a profound sense of peace, connection, and rejuvenation. Carry this sense of serenity with you as you slowly return to the present moment, knowing that you can revisit this enchanting place whenever you need a moment of calm and inner reflection."

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment