cognitivetech · June 10, 2024 10:09
diff --git a/cli.Multi-voice-TTS-GPT-SoVITS.py b/cli.Multi-voice-TTS-GPT-SoVITS.py
 # https://huggingface.co/spaces/Ailyth/Multi-voice-TTS-GPT-SoVITS
 # Clone and follow install instructions in linked colab ^^^^
 # then use this script based on that code
 #
 # python cli.Multi-voice-TTS-GPT-SoVITS.py --audio {input.wav} --text sample.md --language English
 #
 # for sample.md I divided the sections with `##` (h2) headers and put each paragraph on a single line.
 # the output is written to `out/filename/h2-name-00.wav`, so each input file gets its own folder and each heading gets its own output prefix
 #
 # See sample input here in gist

 import argparse
 import numpy as np
 import soundfile as sf
 import torch
 import librosa
 from transformers import pipeline, AutoModelForMaskedLM, AutoTokenizer
 from datetime import datetime
 from time import time as ttime
 from timeit import default_timer as timer
 from polyglot.detect import Detector
 from feature_extractor import cnhubert
 from module.models import SynthesizerTrn
 from module.mel_processing import spectrogram_torch
 from AR.models.t2s_lightning_module import Text2SemanticLightningModule
 from my_utils import load_audio
 import os, re, sys, pytz, random
 import os,re,sys,LangSegment,librosa,pdb,torch,pytz,random
 from text.cleaner import clean_text
 from text import cleaned_text_to_sequence


 # Copy `_CUDA_VISIBLE_DEVICES` into `CUDA_VISIBLE_DEVICES` when the former is set.
 if "_CUDA_VISIBLE_DEVICES" in os.environ:
    os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
 # Timezone used by tprint() for its log timestamps.
 tz = pytz.timezone('Asia/Singapore')
 # Run on the GPU when one is available.
 device = "cuda" if torch.cuda.is_available() else "cpu"

 # Half precision defaults to on with CUDA and off on CPU; the `is_half`
 # environment variable overrides the default.
 # NOTE(review): eval() executes whatever the environment variable contains;
 # consider parsing the value explicitly instead.
 is_half = eval(
    os.environ.get("is_half", "True" if torch.cuda.is_available() else "False")
 )

 # Punctuation marks treated as sentence boundaries by get_first(), trim_text()
 # and get_tts_wav().
 # NOTE(review): the CJK full stop "。" that get_tts_wav() appends is not in
 # this set - confirm that is intended.
 splits = {"？", "！", ".", "?", "!", ":", "：", "—", "…", }

 # Add your imports and other necessary code here
 # Use the local Whisper directory when it exists, otherwise fall back to the
 # "openai/whisper-tiny" model id.
 whisper_path = os.environ.get("whisper_path", "pretrained_models/whisper-tiny")
 if not os.path.exists(whisper_path):
    whisper_path = "openai/whisper-tiny"


 def get_first(text):
    """Return the stripped part of ``text`` that precedes the first split mark.

    The split marks are the characters in the module-level ``splits`` set.
    When none of them occurs, the whole stripped text is returned.
    """
    escaped_marks = [re.escape(mark) for mark in splits]
    boundary = "[" + "".join(escaped_marks) + "]"
    head = re.split(boundary, text)[0]
    return head.strip()

 # Locations of the HuBERT and BERT models; overridable via environment variables.
 cnhubert_base_path = os.environ.get("cnhubert_base_path", "pretrained_models/chinese-hubert-base")
 bert_path = os.environ.get("bert_path", "pretrained_models/chinese-roberta-wwm-ext-large")

 # Fall back to the model ids below when the local directories do not exist.
 if not os.path.exists(cnhubert_base_path):
    cnhubert_base_path = "TencentGameMate/chinese-hubert-base"
 if not os.path.exists(bert_path):
    bert_path = "hfl/chinese-roberta-wwm-ext-large"
 # cnhubert reads its model location from this module attribute.
 cnhubert.cnhubert_base_path = cnhubert_base_path

 # NOTE(review): duplicate of the identical `tz` assignment earlier in the file.
 tz = pytz.timezone('Asia/Singapore')

 # Load the SSL feature extractor once at import time, in half precision when enabled.
 ssl_model = cnhubert.get_model()
 if is_half == True:
    ssl_model = ssl_model.half().to(device)
 else:
    ssl_model = ssl_model.to(device)


 def abs_path(dir):
    """Resolve ``dir`` against the directory that contains the running script."""
    script_file = os.path.abspath(sys.argv[0])
    script_dir = os.path.dirname(script_file)
    return os.path.join(script_dir, dir)
 # Checkpoints loaded at import time, resolved next to the script.
 # clone_voice() later repoints both globals at other checkpoint files.
 gpt_path = abs_path("MODELS/22/22.ckpt")
 sovits_path=abs_path("MODELS/22/22.pth")
 # NOTE(review): these two lines re-read the environment and overwrite the
 # fallback values chosen earlier when the local directories are missing -
 # confirm that is intended.
 cnhubert_base_path = os.environ.get("cnhubert_base_path", "pretrained_models/chinese-hubert-base")
 bert_path = os.environ.get("bert_path", "pretrained_models/chinese-roberta-wwm-ext-large")

 def change_gpt_weights(gpt_path):
    """Load the text-to-semantic checkpoint at ``gpt_path`` into module globals.

    Sets the globals ``hz``, ``max_sec``, ``t2s_model`` and ``config``, prints
    the model's parameter count, and records the checkpoint path in
    ``./gweight.txt``.
    """
    global hz, max_sec, t2s_model, config
    hz = 50
    checkpoint = torch.load(gpt_path, map_location="cpu")
    config = checkpoint["config"]
    max_sec = config["data"]["max_sec"]
    t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
    t2s_model.load_state_dict(checkpoint["weight"])
    # Convert to half precision before moving to the target device.
    if is_half == True:
        t2s_model = t2s_model.half()
    t2s_model = t2s_model.to(device)
    t2s_model.eval()
    param_count = sum(weight.nelement() for weight in t2s_model.parameters())
    print("Number of parameter: %.2fM" % (param_count / 1e6))
    with open("./gweight.txt", "w", encoding="utf-8") as marker:
        marker.write(gpt_path)

 # Load the text-to-semantic model once at import time.
 change_gpt_weights(gpt_path)


 # Maps the user-facing language label to the internal language code.
 dict_language = {
    ("中文1"): "all_zh",# treat all text as Chinese
    ("English"): "en",# treat all text as English (unchanged)
    ("日文1"): "all_ja",# treat all text as Japanese
    ("中文"): "zh",# mixed Chinese/English recognition (unchanged)
    ("日本語"): "ja",# mixed Japanese/English recognition (unchanged)
    ("混合"): "auto",# multilingual: segment the text and detect each language
 }

 class DictToAttrRecursive(dict):
    """Dictionary whose keys can also be read, written and deleted as attributes.

    Nested plain dictionaries are wrapped recursively, so ``cfg.data.hop_length``
    works alongside ``cfg["data"]["hop_length"]``.  Attribute assignment stores
    the value both in the mapping and in the instance ``__dict__``.
    """

    def __init__(self, input_dict):
        super().__init__(input_dict)
        for key, value in input_dict.items():
            if isinstance(value, dict):
                value = DictToAttrRecursive(value)
            self[key] = value
            setattr(self, key, value)

    def __getattr__(self, item):
        # Only reached when normal attribute lookup fails: fall back to the mapping.
        try:
            return self[item]
        except KeyError:
            raise AttributeError(f"Attribute {item} not found") from None

    def __setattr__(self, key, value):
        if isinstance(value, dict):
            value = DictToAttrRecursive(value)
        super(DictToAttrRecursive, self).__setitem__(key, value)
        super().__setattr__(key, value)

    def __delattr__(self, item):
        try:
            del self[item]
        except KeyError:
            raise AttributeError(f"Attribute {item} not found") from None
        # __setattr__ also stored the value in the instance __dict__; drop that
        # copy too, otherwise the deleted attribute would remain readable.
        self.__dict__.pop(item, None)

 def change_sovits_weights(sovits_path):
    """Load the SoVITS checkpoint at ``sovits_path`` into module globals.

    Sets the globals ``hps`` (the checkpoint config wrapped for attribute
    access) and ``vq_model``, prints the result of loading the weights, and
    records the checkpoint path in ``./sweight.txt``.
    """
    global vq_model, hps
    checkpoint = torch.load(sovits_path, map_location="cpu")
    hps = DictToAttrRecursive(checkpoint["config"])
    hps.model.semantic_frame_rate = "25hz"
    vq_model = SynthesizerTrn(
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model
    )
    # Checkpoints whose path does not contain "pretrained" are loaded without enc_q.
    if "pretrained" not in sovits_path:
        del vq_model.enc_q
    vq_model = vq_model.half().to(device) if is_half == True else vq_model.to(device)
    vq_model.eval()
    load_report = vq_model.load_state_dict(checkpoint["weight"], strict=False)
    print(load_report)
    with open("./sweight.txt", "w", encoding="utf-8") as marker:
        marker.write(sovits_path)


 # Load the SoVITS model once at import time.
 change_sovits_weights(sovits_path)

 # Speech-recognition pipeline used by transcribe() to obtain the reference
 # transcript and its language.
 pipe = pipeline(
    task="automatic-speech-recognition",
    model=whisper_path,
    chunk_length_s=30,
    device=device,)

 def duration(audio_file_path):
    """Placeholder duration check that accepts every audio file.

    The argument is not inspected; the function always returns ``True``.
    """
    return True

 def trim_text(text,language):
    """Shorten ``text`` to a bounded length, preferring to cut at a split mark.

    Newlines are removed and the text is stripped first.  For
    ``language == 'English'`` the limit is 200 whitespace-separated words; for
    any other language it is 120 characters.  Text within the limit is returned
    as is.  Otherwise the cut is made after the last unit at or before the limit
    that contains a character from ``splits``; failing that, after the first
    such unit within 30 units past the limit; failing that, exactly at the limit.
    """
    char_cap = 120  # characters, used for non-English text
    word_cap = 200  # words, used for English text
    lookahead = 30
    text = text.replace('\n', '').strip()

    if language != 'English':
        # Chinese / Japanese: the unit is a single character.
        if len(text) <= char_cap:
            return text
        backward = range(char_cap, -1, -1)
        forward = range(char_cap, min(len(text), char_cap + lookahead))
        for pos in (*backward, *forward):
            if text[pos] in splits:
                return text[:pos + 1]
        return text[:char_cap]

    # English: the unit is a whitespace-separated word.
    words = text.split()
    if len(words) <= word_cap:
        return text
    backward = range(word_cap, -1, -1)
    forward = range(word_cap, min(len(words), word_cap + lookahead))
    for pos in (*backward, *forward):
        if any(mark in words[pos] for mark in splits):
            return ' '.join(words[:pos + 1])
    return ' '.join(words[:word_cap])

 def transcribe(voice):
    """Transcribe the reference audio with the module-level ``pipe`` and detect its language.

    Args:
        voice: Audio input handed to ``pipe`` (the CLI passes a file path).

    Returns:
        ``(text, language)`` where ``language`` is one of the ``dict_language``
        keys ``'English'``, ``'中文'`` or ``'日本語'``.

    Raises:
        ValueError: if ``voice`` is ``None``.
        UnboundLocalError: if the detected language is not English, Chinese or
            Japanese.  The exception type is kept on purpose because
            ``clone_voice`` catches exactly this type.
    """
    if voice is None:
        # Previously this only attempted to warn (through an undefined helper)
        # and then still ran the pipeline on None.
        raise ValueError("No audio file submitted! Please upload or record an audio file before submitting your request.")
    time1 = timer()
    tprint('⚡Start Clone - transcribe')
    result = pipe(
        voice,
        batch_size=8,
        generate_kwargs={"task": "transcribe"},
        return_timestamps=True,
        return_language=True,
    )
    text = result['text']
    # The language reported for the first chunk decides the prompt language.
    lang = result['chunks'][0]['language']
    language = {
        'english': 'English',
        'chinese': '中文',
        'japanese': '日本語',
    }.get(lang)
    if language is None:
        raise UnboundLocalError(f"unsupported language detected: {lang}")

    time2 = timer()
    tprint(f'transcribe COMPLETE,{round(time2-time1,4)}s')
    tprint(f'\nTRANSCRIBE RESULT：\n 🔣Language：{language} \n 🔣Text：{text}' )
    return text, language

 def tprint(text):
    """Print ``text`` prefixed with the current time in the module timezone ``tz``."""
    stamp = datetime.now(tz).strftime('%H:%M:%S')
    message = f'UTC+8 - {stamp} - {text}'
    print(message)

 def get_cleaned_text_final(text,language):
    """Clean ``text`` for the given internal language code.

    Args:
        text: Text to clean.
        language: One of the ``dict_language`` values
            (``en``, ``all_zh``, ``all_ja``, ``zh``, ``ja``, ``auto``).

    Returns:
        The ``(phones, word2ph, norm_text)`` triple produced by the cleaner.

    Raises:
        ValueError: if ``language`` is not one of the supported codes
            (previously this surfaced as an UnboundLocalError).
    """
    if language in {"en","all_zh","all_ja"}:
        phones, word2ph, norm_text = clean_text_inf(text, language)
    elif language in {"zh", "ja","auto"}:
        # NOTE(review): nonen_clean_text_inf is not defined in the visible part
        # of this file - confirm it is provided elsewhere.
        phones, word2ph, norm_text = nonen_clean_text_inf(text, language)
    else:
        raise ValueError(f"Unsupported language code: {language!r}")
    return phones, word2ph, norm_text

 def get_bert_final(phones, word2ph, text,language,device):
    """Return the BERT feature tensor for ``text`` in the given language code.

    ``en`` is delegated to ``get_bert_inf``; ``zh``, ``ja`` and ``auto`` to
    ``nonen_get_bert_inf``; ``all_zh`` to ``get_bert_feature``.  Any other code
    yields a zero tensor of shape ``(1024, len(phones))`` on ``device``.
    """
    if language == "en":
        return get_bert_inf(phones, word2ph, text, language)
    if language in {"zh", "ja","auto"}:
        return nonen_get_bert_inf(text, language)
    if language == "all_zh":
        return get_bert_feature(text, word2ph).to(device)
    placeholder = torch.zeros((1024, len(phones)))
    return placeholder.to(device)

 def get_spepc(hps, filename):
    """Load ``filename`` and return its spectrogram computed with the settings in ``hps.data``."""
    data_cfg = hps.data
    samples = load_audio(filename, int(data_cfg.sampling_rate))
    # spectrogram_torch is given the audio with a leading dimension of size 1.
    batch = torch.FloatTensor(samples).unsqueeze(0)
    return spectrogram_torch(
        batch,
        data_cfg.filter_length,
        data_cfg.sampling_rate,
        data_cfg.hop_length,
        data_cfg.win_length,
        center=False,
    )

 # Tensor dtype matching the precision mode; get_tts_wav() casts the BERT features to it.
 dtype=torch.float16 if is_half == True else torch.float32
 def get_bert_inf(phones, word2ph, norm_text, language):
    """Return BERT features for Chinese text, or a zero tensor for any other language.

    An ``all_`` prefix on ``language`` is ignored.  The zero tensor has shape
    ``(1024, len(phones))`` and uses float16 when ``is_half`` is set, float32
    otherwise.
    """
    lang_code = language.replace("all_","")
    if lang_code != "zh":
        zeros_dtype = torch.float16 if is_half == True else torch.float32
        return torch.zeros((1024, len(phones)), dtype=zeros_dtype).to(device)
    return get_bert_feature(norm_text, word2ph).to(device)#.to(dtype)

 def merge_short_text_in_array(texts, threshold):
    """Join consecutive entries of ``texts`` until each chunk has at least ``threshold`` characters.

    A trailing remainder shorter than ``threshold`` is appended to the last
    chunk, or becomes the only chunk when nothing reached the threshold.
    Inputs with fewer than two entries are returned unchanged.
    """
    if len(texts) < 2:
        return texts
    merged = []
    pending = ""
    for piece in texts:
        pending += piece
        if len(pending) < threshold:
            continue
        merged.append(pending)
        pending = ""
    if pending:
        if merged:
            merged[-1] += pending
        else:
            merged.append(pending)
    return merged

 def clean_text_inf(text, language):
    """Keep the segments of ``text`` in the requested language and clean them.

    An ``all_`` prefix on ``language`` is ignored.  For ``ja`` both ``ja`` and
    ``zh`` segments reported by ``LangSegment`` are kept; otherwise only
    segments whose language equals ``language``.  The kept segments are joined
    with single spaces and passed to ``clean_text``.

    Returns:
        ``(phones, word2ph, norm_text)`` with ``phones`` converted by
        ``cleaned_text_to_sequence``.
    """
    language = language.replace("all_","")
    accepted = {"ja", "zh"} if language == "ja" else {language}
    kept = [
        segment["text"] + " "
        for segment in LangSegment.getTexts(text)
        if segment["lang"] in accepted
    ]
    formattext = "".join(kept)
    # Collapse any run of spaces into a single space.
    while "  " in formattext:
        formattext = formattext.replace("  ", " ")
    phones, word2ph, norm_text = clean_text(formattext, language)
    return cleaned_text_to_sequence(phones), word2ph, norm_text

 def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, how_to_cut=("Do not split"), volume_scale=1.0):
    """Synthesize ``text`` in the voice of the reference audio and write ``output_audio.wav``.

    Args:
        ref_wav_path: Path of the reference audio; it must be 3-10 seconds long
            when loaded at 16 kHz.
        prompt_text: Transcript of the reference audio.
        prompt_language: ``dict_language`` key describing ``prompt_text``.
        text: Text to synthesize; it is shortened with ``trim_text`` first.
        text_language: ``dict_language`` key describing ``text``.
        how_to_cut: Name of the sentence-splitting strategy; the default leaves
            the text unsplit.
        volume_scale: Gain applied to the samples before they are written.

    Returns:
        The output file name, or ``None`` when the text or prompt is empty, the
        text language is unsupported, a segment fails with a RuntimeError, or no
        audio was produced.

    Raises:
        OSError: if the reference audio is outside the 3-10 second range.
    """
    if not duration(ref_wav_path):
        return None
    if text == '':
        tprint("Please enter text to generate/请输入生成文字")
        return None
    t0 = ttime()
    startTime = timer()
    text = trim_text(text, text_language)
    # The checkpoint paths are module globals that clone_voice() repoints, so
    # both models are reloaded from the current paths on every call.
    change_sovits_weights(sovits_path)
    tprint(f'🏕️LOADED SoVITS Model: {sovits_path}')
    change_gpt_weights(gpt_path)
    tprint(f'🏕️LOADED GPT Model: {gpt_path}')

    prompt_language = dict_language[prompt_language]
    try:
        text_language = dict_language[text_language]
    except KeyError as e:
        tprint(f"Unsupported language type: {e}")
        return None

    prompt_text = prompt_text.strip("\n")
    text = text.strip("\n")
    # Guard the indexing below: whitespace-only text and empty transcripts
    # used to fail with an IndexError.
    if not text:
        tprint("Please enter text to generate/请输入生成文字")
        return None
    if not prompt_text:
        tprint("The reference transcript is empty; cannot build the prompt.")
        return None
    if prompt_text[-1] not in splits:
        prompt_text += "。" if prompt_language != "en" else "."
    # A very short first clause gets a leading full stop prepended.
    if text[0] not in splits and len(get_first(text)) < 4:
        text = "。" + text if text_language != "en" else "." + text

    # 0.3 s of silence: appended to the reference audio and after every segment.
    zero_wav = np.zeros(
        int(hps.data.sampling_rate * 0.3),
        dtype=np.float16 if is_half == True else np.float32,
    )
    with torch.no_grad():
        wav16k, sr = librosa.load(ref_wav_path, sr=16000)
        if wav16k.shape[0] > 160000 or wav16k.shape[0] < 48000:
            # Message: "reference audio is outside the 3-10 second range, please replace it".
            raise OSError('参考音频在3~10秒范围外，请更换！')
        wav16k = torch.from_numpy(wav16k)
        zero_wav_torch = torch.from_numpy(zero_wav)
        if is_half == True:
            wav16k = wav16k.half().to(device)
            zero_wav_torch = zero_wav_torch.half().to(device)
        else:
            wav16k = wav16k.to(device)
            zero_wav_torch = zero_wav_torch.to(device)
        wav16k = torch.cat([wav16k, zero_wav_torch])
        ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2)
        codes = vq_model.extract_latent(ssl_content)
        prompt_semantic = codes[0, 0]
    t1 = ttime()

    phones1, word2ph1, norm_text1 = get_cleaned_text_final(prompt_text, prompt_language)

    # NOTE(review): cut1..cut5 are not defined in the visible part of this file;
    # only the default "Do not split" avoids them - confirm they exist elsewhere.
    if how_to_cut == "Split into groups of 4 sentences":
        text = cut1(text)
    elif how_to_cut == "Split every 50 characters":
        text = cut2(text)
    elif how_to_cut == "Split at CN/JP periods (。)":
        text = cut3(text)
    elif how_to_cut == "Split at English periods (.)":
        text = cut4(text)
    elif how_to_cut == "Split at punctuation marks":
        text = cut5(text)
    while "\n\n" in text:
        text = text.replace("\n\n", "\n")
    print(f"🧨实际输入的目标文本(切句后):{text}\n")
    texts = merge_short_text_in_array(text.split("\n"), 5)
    audio_opt = []
    bert1 = get_bert_final(phones1, word2ph1, norm_text1, prompt_language, device).to(dtype)

    # These depend only on the reference audio, so compute them once rather
    # than once per segment.
    refer = get_spepc(hps, ref_wav_path)
    refer = refer.half().to(device) if is_half == True else refer.to(device)
    prompt = prompt_semantic.unsqueeze(0).to(device)

    # Keep the timing report well-defined even when no segment is synthesized.
    t2 = t3 = t4 = t1
    for segment in texts:
        if len(segment.strip()) == 0:
            continue
        if segment[-1] not in splits:
            segment += "。" if text_language != "en" else "."
        print(("\n🎈实际输入的目标文本(每句):"), segment)
        phones2, word2ph2, norm_text2 = get_cleaned_text_final(segment, text_language)
        try:
            bert2 = get_bert_final(phones2, word2ph2, norm_text2, text_language, device).to(dtype)
        except RuntimeError as e:
            tprint(f"The input text does not match the language/输入文本与语言不匹配: {e}")
            return None
        bert = torch.cat([bert1, bert2], 1)

        all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
        bert = bert.to(device).unsqueeze(0)
        all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device)
        t2 = ttime()
        with torch.no_grad():
            pred_semantic, idx = t2s_model.model.infer_panel(
                all_phoneme_ids,
                all_phoneme_len,
                prompt,
                bert,
                top_k=config["inference"]["top_k"],
                early_stop_num=hz * max_sec,
            )
        t3 = ttime()
        # Keep only the last idx predicted tokens and add a leading dimension.
        pred_semantic = pred_semantic[:, -idx:].unsqueeze(0)
        try:
            audio = (
                vq_model.decode(
                    pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refer
                )
                .detach()
                .cpu()
                .numpy()[0, 0]
            )
        except RuntimeError as e:
            tprint(f"The input text does not match the language/输入文本与语言不匹配: {e}")
            return None

        # Normalize only when the segment would clip.
        max_audio = np.abs(audio).max()
        if max_audio > 1:
            audio /= max_audio
        audio_opt.append(audio)
        audio_opt.append(zero_wav)
        t4 = ttime()

    if not audio_opt:
        tprint("No audio was generated: the text contains no non-empty segment.")
        return None
    print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))

    # Scale in float32 and clip to the int16 range before converting; a sample
    # at full scale (or a gain above 1) previously wrapped around.
    samples = np.concatenate(audio_opt, 0).astype(np.float32) * 32768 * volume_scale
    audio_data = np.clip(samples, -32768, 32767).astype(np.int16)
    output_wav = "output_audio.wav"
    sf.write(output_wav, audio_data, hps.data.sampling_rate)
    endTime = timer()
    tprint(f'🆗TTS COMPLETE,{round(endTime-startTime,4)}s')
    return output_wav


 def clone_voice(user_voice, user_text, user_lang):
    """Clone the voice in ``user_voice`` and speak ``user_text`` with it.

    The reference audio is transcribed to obtain the prompt text and language,
    then ``get_tts_wav`` synthesizes the (trimmed) text.  The global checkpoint
    paths ``gpt_path`` and ``sovits_path`` are repointed before synthesis.

    Args:
        user_voice: Path of the reference audio.
        user_text: Text to synthesize.
        user_lang: ``dict_language`` key describing ``user_text``.

    Returns:
        The file name returned by ``get_tts_wav``, or ``None`` when the text is
        empty, the audio language is not recognized, or synthesis fails.
    """
    global gpt_path, sovits_path
    if not duration(user_voice):
        return None
    if user_text == '':
        # Messages go through tprint(): the wprint() helper used before is not
        # defined in this file, so these paths raised NameError.
        tprint("Please enter text to generate/请输入生成文字")
        return None
    user_text = trim_text(user_text, user_lang)
    time1 = timer()
    gpt_path = abs_path("pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt")
    sovits_path = abs_path("pretrained_models/s2G488k.pth")
    try:
        prompt_text, prompt_language = transcribe(user_voice)
    except UnboundLocalError as e:
        # transcribe() signals an unsupported detected language with this type.
        tprint(f"The language in the audio cannot be recognized ：{str(e)}")
        return None

    output_wav = get_tts_wav(
        user_voice,
        prompt_text,
        prompt_language,
        user_text,
        user_lang,
        how_to_cut="Do not split",
        volume_scale=1.0)
    time2 = timer()
    tprint(f'🆗CLONE COMPLETE,{round(time2-time1,4)}s')
    return output_wav

 def _heading_to_prefix(line):
    """Turn a markdown heading line into a file-name prefix."""
    # Strip every leading '#', so "### Sub" yields "sub" rather than "#-sub".
    name = line.lstrip('#').strip().lower().replace(' ', '-')
    # A path separator in the heading would point into a missing directory.
    for separator in (os.sep, os.altsep):
        if separator:
            name = name.replace(separator, '-')
    return name


 def process_text_file(file_path, user_voice, user_lang):
    """Synthesize every paragraph of a markdown file, one WAV file per line.

    Lines starting with ``##`` begin a new section and reset the counter.  Each
    following non-blank line is synthesized with ``clone_voice`` and moved to
    ``out/<file base name>/<heading>-NN.wav``.  Blank lines, and lines that
    appear before the first heading, are skipped.

    Args:
        file_path: Path of the UTF-8 text file to read.
        user_voice: Path of the reference audio.
        user_lang: ``dict_language`` key describing the text.
    """
    # Read as UTF-8 explicitly; the platform default may not decode CJK text.
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    current_heading = None
    file_counter = 0

    # Extract the base name of the input text file (without extension)
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    output_dir = os.path.join('out', base_name)
    os.makedirs(output_dir, exist_ok=True)

    for line in lines:
        line = line.strip()
        if not line:
            # Blank separator lines carry nothing to synthesize.
            continue
        if line.startswith('##'):
            current_heading = _heading_to_prefix(line)
            file_counter = 0
            continue
        if not current_heading:
            continue
        output_wav = clone_voice(user_voice, line, user_lang)
        if not output_wav:
            continue
        output_filename = f"{output_dir}/{current_heading}-{file_counter:02d}.wav"
        # os.replace overwrites an existing target on every platform.
        os.replace(output_wav, output_filename)
        print(f"Generated audio saved to: {output_filename}")
        file_counter += 1

 def main():
    """Parse the command-line options and synthesize every section of the text file."""
    cli = argparse.ArgumentParser(description="Clone custom voice from terminal")
    cli.add_argument("--audio", type=str, required=True, help="Path to the source audio file")
    cli.add_argument("--text", type=str, required=True, help="Path to the text file to read")
    cli.add_argument("--language", type=str, default="English", help="Language of the text (default: English)")
    options = cli.parse_args()

    process_text_file(options.text, options.audio, options.language)

 # Run the CLI only when the file is executed as a script.
 if __name__ == "__main__":
    main()
diff --git a/sample.md b/sample.md
	# https://huggingface.co/spaces/Ailyth/Multi-voice-TTS-GPT-SoVITS
	# Clone and follow install instructions in linked colab ^^^^
	# then use this script based on that code
	#
	# python cli.Multi-voice-TTS-GPT-SoVITS.py --audio {input.wav} --text sample.md --language English
	#
	# for sample.md I divided the sections with `##` (h2) headers and put each paragraph on a single line.
	# the output is written to `out/filename/h2-name-00.wav`, so each input file gets its own folder and each heading gets its own output prefix
	#
	# See sample input here in gist

	import argparse
	import numpy as np
	import soundfile as sf
	import torch
	import librosa
	from transformers import pipeline, AutoModelForMaskedLM, AutoTokenizer
	from datetime import datetime
	from time import time as ttime
	from timeit import default_timer as timer
	from polyglot.detect import Detector
	from feature_extractor import cnhubert
	from module.models import SynthesizerTrn
	from module.mel_processing import spectrogram_torch
	from AR.models.t2s_lightning_module import Text2SemanticLightningModule
	from my_utils import load_audio
	import os, re, sys, pytz, random
	import os,re,sys,LangSegment,librosa,pdb,torch,pytz,random
	from text.cleaner import clean_text
	from text import cleaned_text_to_sequence


	if "_CUDA_VISIBLE_DEVICES" in os.environ:
	os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
	tz = pytz.timezone('Asia/Singapore')
	device = "cuda" if torch.cuda.is_available() else "cpu"

	is_half = eval(
	os.environ.get("is_half", "True" if torch.cuda.is_available() else "False")
	)

	splits = {"？", "！", ".", "?", "!", ":", "：", "—", "…", }

	# Add your imports and other necessary code here
	whisper_path = os.environ.get("whisper_path", "pretrained_models/whisper-tiny")
	if not os.path.exists(whisper_path):
	whisper_path = "openai/whisper-tiny"


	def get_first(text):
	pattern = "[" + "".join(re.escape(sep) for sep in splits) + "]"
	text = re.split(pattern, text)[0].strip()
	return text

	cnhubert_base_path = os.environ.get("cnhubert_base_path", "pretrained_models/chinese-hubert-base")
	bert_path = os.environ.get("bert_path", "pretrained_models/chinese-roberta-wwm-ext-large")

	if not os.path.exists(cnhubert_base_path):
	cnhubert_base_path = "TencentGameMate/chinese-hubert-base"
	if not os.path.exists(bert_path):
	bert_path = "hfl/chinese-roberta-wwm-ext-large"
	cnhubert.cnhubert_base_path = cnhubert_base_path

	tz = pytz.timezone('Asia/Singapore')

	ssl_model = cnhubert.get_model()
	if is_half == True:
	ssl_model = ssl_model.half().to(device)
	else:
	ssl_model = ssl_model.to(device)


	def abs_path(dir):
	global_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
	return(os.path.join(global_dir, dir))
	gpt_path = abs_path("MODELS/22/22.ckpt")
	sovits_path=abs_path("MODELS/22/22.pth")
	cnhubert_base_path = os.environ.get("cnhubert_base_path", "pretrained_models/chinese-hubert-base")
	bert_path = os.environ.get("bert_path", "pretrained_models/chinese-roberta-wwm-ext-large")

	def change_gpt_weights(gpt_path):
	global hz, max_sec, t2s_model, config
	hz = 50
	dict_s1 = torch.load(gpt_path, map_location="cpu")
	config = dict_s1["config"]
	max_sec = config["data"]["max_sec"]
	t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
	t2s_model.load_state_dict(dict_s1["weight"])
	if is_half == True:
	t2s_model = t2s_model.half()
	t2s_model = t2s_model.to(device)
	t2s_model.eval()
	total = sum([param.nelement() for param in t2s_model.parameters()])
	print("Number of parameter: %.2fM" % (total / 1e6))
	with open("./gweight.txt", "w", encoding="utf-8") as f: f.write(gpt_path)

	change_gpt_weights(gpt_path)


	dict_language = {
	("中文1"): "all_zh",#全部按中文识别
	("English"): "en",#全部按英文识别#######不变
	("日文1"): "all_ja",#全部按日文识别
	("中文"): "zh",#按中英混合识别####不变
	("日本語"): "ja",#按日英混合识别####不变
	("混合"): "auto",#多语种启动切分识别语种
	}

	class DictToAttrRecursive(dict):
	    """Dictionary whose keys can also be read, written and deleted as attributes.

	    Nested plain dictionaries are wrapped recursively.  Attribute assignment
	    stores the value both in the mapping and in the instance ``__dict__``.
	    """

	    def __init__(self, input_dict):
	        super().__init__(input_dict)
	        for key, value in input_dict.items():
	            if isinstance(value, dict):
	                value = DictToAttrRecursive(value)
	            self[key] = value
	            setattr(self, key, value)

	    def __getattr__(self, item):
	        # Only reached when normal attribute lookup fails: fall back to the mapping.
	        try:
	            return self[item]
	        except KeyError:
	            raise AttributeError(f"Attribute {item} not found") from None

	    def __setattr__(self, key, value):
	        if isinstance(value, dict):
	            value = DictToAttrRecursive(value)
	        super(DictToAttrRecursive, self).__setitem__(key, value)
	        super().__setattr__(key, value)

	    def __delattr__(self, item):
	        try:
	            del self[item]
	        except KeyError:
	            raise AttributeError(f"Attribute {item} not found") from None
	        # Drop the copy kept in the instance __dict__ too, otherwise the
	        # deleted attribute would remain readable.
	        self.__dict__.pop(item, None)

	def change_sovits_weights(sovits_path):
	global vq_model, hps
	dict_s2 = torch.load(sovits_path, map_location="cpu")
	hps = dict_s2["config"]
	hps = DictToAttrRecursive(hps)
	hps.model.semantic_frame_rate = "25hz"
	vq_model = SynthesizerTrn(
	hps.data.filter_length // 2 + 1,
	hps.train.segment_size // hps.data.hop_length,
	n_speakers=hps.data.n_speakers,
	**hps.model
	)
	if ("pretrained" not in sovits_path):
	del vq_model.enc_q
	if is_half == True:
	vq_model = vq_model.half().to(device)
	else:
	vq_model = vq_model.to(device)
	vq_model.eval()
	print(vq_model.load_state_dict(dict_s2["weight"], strict=False))
	with open("./sweight.txt", "w", encoding="utf-8") as f:
	f.write(sovits_path)


	change_sovits_weights(sovits_path)

	pipe = pipeline(
	task="automatic-speech-recognition",
	model=whisper_path,
	chunk_length_s=30,
	device=device,)

	def duration(audio_file_path):
	return True

	def trim_text(text,language):
	    """Shorten ``text`` to a bounded length, preferring to cut at a split mark.

	    Newlines are removed and the text is stripped first.  For
	    ``language == 'English'`` the limit is 200 whitespace-separated words;
	    for any other language it is 120 characters.  Text within the limit is
	    returned as is.  Otherwise the cut is made after the last unit at or
	    before the limit that contains a character from ``splits``; failing
	    that, after the first such unit within 30 units past the limit; failing
	    that, exactly at the limit.
	    """
	    limit_cj = 120  # characters
	    limit_en = 200  # words
	    search_limit_cj = limit_cj + 30
	    search_limit_en = limit_en + 30
	    text = text.replace('\n', '').strip()

	    if language == 'English':
	        words = text.split()
	        if len(words) <= limit_en:
	            return text
	        # Look backwards from the limit first, then a little past it.
	        for i in range(limit_en, -1, -1):
	            if any(punct in words[i] for punct in splits):
	                return ' '.join(words[:i + 1])
	        for i in range(limit_en, min(len(words), search_limit_en)):
	            if any(punct in words[i] for punct in splits):
	                return ' '.join(words[:i + 1])
	        return ' '.join(words[:limit_en])

	    # Chinese / Japanese: the unit is a single character.
	    if len(text) <= limit_cj:
	        return text
	    for i in range(limit_cj, -1, -1):
	        if text[i] in splits:
	            return text[:i + 1]
	    for i in range(limit_cj, min(len(text), search_limit_cj)):
	        if text[i] in splits:
	            return text[:i + 1]
	    return text[:limit_cj]

	def transcribe(voice):
	time1=timer()
	tprint('⚡Start Clone - transcribe')
	task="transcribe"
	if voice is None:
	wprint("No audio file submitted! Please upload or record an audio file before submitting your request.")
	R = pipe(voice, batch_size=8, generate_kwargs={"task": task}, return_timestamps=True,return_language=True)
	text=R['text']
	lang=R['chunks'][0]['language']
	if lang=='english':
	language='English'
	elif lang =='chinese':
	language='中文'
	elif lang=='japanese':
	language = '日本語'

	time2=timer()
	tprint(f'transcribe COMPLETE,{round(time2-time1,4)}s')
	tprint(f'\nTRANSCRIBE RESULT：\n 🔣Language：{language} \n 🔣Text：{text}' )
	return text,language

	def tprint(text):
	now=datetime.now(tz).strftime('%H:%M:%S')
	print(f'UTC+8 - {now} - {text}')

	def get_cleaned_text_final(text,language):
	if language in {"en","all_zh","all_ja"}:
	phones, word2ph, norm_text = clean_text_inf(text, language)
	elif language in {"zh", "ja","auto"}:
	phones, word2ph, norm_text = nonen_clean_text_inf(text, language)
	return phones, word2ph, norm_text

	def get_bert_final(phones, word2ph, text,language,device):
	if language == "en":
	bert = get_bert_inf(phones, word2ph, text, language)
	elif language in {"zh", "ja","auto"}:
	bert = nonen_get_bert_inf(text, language)
	elif language == "all_zh":
	bert = get_bert_feature(text, word2ph).to(device)
	else:
	bert = torch.zeros((1024, len(phones))).to(device)
	return bert

	def get_spepc(hps, filename):
	audio = load_audio(filename, int(hps.data.sampling_rate))
	audio = torch.FloatTensor(audio)
	audio_norm = audio
	audio_norm = audio_norm.unsqueeze(0)
	spec = spectrogram_torch(
	audio_norm,
	hps.data.filter_length,
	hps.data.sampling_rate,
	hps.data.hop_length,
	hps.data.win_length,
	center=False,
	)
	return spec

	dtype=torch.float16 if is_half == True else torch.float32
	def get_bert_inf(phones, word2ph, norm_text, language):
	language=language.replace("all_","")
	if language == "zh":
	bert = get_bert_feature(norm_text, word2ph).to(device)#.to(dtype)
	else:
	bert = torch.zeros(
	(1024, len(phones)),
	dtype=torch.float16 if is_half == True else torch.float32,
	).to(device)

	return bert

	def merge_short_text_in_array(texts, threshold):
	    """Join consecutive entries of ``texts`` until each chunk has at least ``threshold`` characters.

	    A trailing remainder shorter than ``threshold`` is appended to the last
	    chunk, or becomes the only chunk when nothing reached the threshold.
	    Inputs with fewer than two entries are returned unchanged.
	    """
	    if len(texts) < 2:
	        return texts
	    result = []
	    text = ""
	    for ele in texts:
	        text += ele
	        if len(text) >= threshold:
	            result.append(text)
	            text = ""
	    if text:
	        if result:
	            result[-1] += text
	        else:
	            result.append(text)
	    return result

	def clean_text_inf(text, language):
	    """Keep the segments of ``text`` in the requested language and clean them.

	    An ``all_`` prefix on ``language`` is ignored.  For ``ja`` both ``ja``
	    and ``zh`` segments reported by ``LangSegment`` are kept; otherwise only
	    segments whose language equals ``language``.

	    Returns:
	        ``(phones, word2ph, norm_text)`` with ``phones`` converted by
	        ``cleaned_text_to_sequence``.
	    """
	    formattext = ""
	    language = language.replace("all_","")
	    for tmp in LangSegment.getTexts(text):
	        if language == "ja":
	            if tmp["lang"] == language or tmp["lang"] == "zh":
	                formattext += tmp["text"] + " "
	            continue
	        if tmp["lang"] == language:
	            formattext += tmp["text"] + " "
	    # Collapse runs of spaces.  The pattern must be TWO spaces: with a single
	    # space the loop condition stays true forever once any segment is kept.
	    while "  " in formattext:
	        formattext = formattext.replace("  ", " ")
	    phones, word2ph, norm_text = clean_text(formattext, language)
	    phones = cleaned_text_to_sequence(phones)
	    return phones, word2ph, norm_text

	def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, how_to_cut=("Do not split"), volume_scale=1.0):
	if not duration(ref_wav_path):
	return None
	if text == '':
	wprint("Please enter text to generate/请输入生成文字")
	return None
	t0 = ttime()
	startTime=timer()
	text=trim_text(text,text_language)
	change_sovits_weights(sovits_path)
	tprint(f'🏕️LOADED SoVITS Model: {sovits_path}')
	change_gpt_weights(gpt_path)
	tprint(f'🏕️LOADED GPT Model: {gpt_path}')

	prompt_language = dict_language[prompt_language]
	try:
	text_language = dict_language[text_language]
	except KeyError as e:
	wprint(f"Unsupported language type: {e}")
	return None

	prompt_text = prompt_text.strip("\n")
	if (prompt_text[-1] not in splits): prompt_text += "。" if prompt_language != "en" else "."
	text = text.strip("\n")
	if (text[0] not in splits and len(get_first(text)) < 4): text = "。" + text if text_language != "en" else "." + text
	#print(("实际输入的参考文本:"), prompt_text)
	#print(("📝实际输入的目标文本:"), text)
	zero_wav = np.zeros(
	int(hps.data.sampling_rate * 0.3),
	dtype=np.float16 if is_half == True else np.float32,
	)
	with torch.no_grad():
	wav16k, sr = librosa.load(ref_wav_path, sr=16000)
	if (wav16k.shape[0] > 160000 or wav16k.shape[0] < 48000):
	errinfo='参考音频在3~10秒范围外，请更换！'
	raise OSError((errinfo))
	wav16k = torch.from_numpy(wav16k)
	zero_wav_torch = torch.from_numpy(zero_wav)
	if is_half == True:
	wav16k = wav16k.half().to(device)
	zero_wav_torch = zero_wav_torch.half().to(device)
	else:
	wav16k = wav16k.to(device)
	zero_wav_torch = zero_wav_torch.to(device)
	wav16k = torch.cat([wav16k, zero_wav_torch])
	ssl_content = ssl_model.model(wav16k.unsqueeze(0))[
	"last_hidden_state"
	].transpose(
	1, 2
	) # .float()
	codes = vq_model.extract_latent(ssl_content)
	prompt_semantic = codes[0, 0]
	t1 = ttime()

	phones1, word2ph1, norm_text1=get_cleaned_text_final(prompt_text, prompt_language)

	if (how_to_cut == ("Split into groups of 4 sentences")):
	text = cut1(text)
	elif (how_to_cut == ("Split every 50 characters")):
	text = cut2(text)
	elif (how_to_cut == ("Split at CN/JP periods (。)")):
	text = cut3(text)
	elif (how_to_cut == ("Split at English periods (.)")):
	text = cut4(text)
	elif (how_to_cut == ("Split at punctuation marks")):
	text = cut5(text)
	while "\n\n" in text:
	text = text.replace("\n\n", "\n")
	print(f"🧨实际输入的目标文本(切句后):{text}\n")
	texts = text.split("\n")
	texts = merge_short_text_in_array(texts, 5)
	audio_opt = []
	bert1=get_bert_final(phones1, word2ph1, norm_text1,prompt_language,device).to(dtype)

	for text in texts:
	if (len(text.strip()) == 0):
	continue
	if (text[-1] not in splits): text += "。" if text_language != "en" else "."
	print(("\n🎈实际输入的目标文本(每句):"), text)
	phones2, word2ph2, norm_text2 = get_cleaned_text_final(text, text_language)
	try:
	bert2 = get_bert_final(phones2, word2ph2, norm_text2, text_language, device).to(dtype)
	except RuntimeError as e:
	wprint(f"The input text does not match the language/输入文本与语言不匹配: {e}")
	return None
	bert = torch.cat([bert1, bert2], 1)

	all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
	bert = bert.to(device).unsqueeze(0)
	all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device)
	prompt = prompt_semantic.unsqueeze(0).to(device)
	t2 = ttime()
	with torch.no_grad():
	# pred_semantic = t2s_model.model.infer(
	pred_semantic, idx = t2s_model.model.infer_panel(
	all_phoneme_ids,
	all_phoneme_len,
	prompt,
	bert,
	# prompt_phone_len=ph_offset,
	top_k=config["inference"]["top_k"],
	early_stop_num=hz * max_sec,
	)
	t3 = ttime()
	# print(pred_semantic.shape,idx)
	pred_semantic = pred_semantic[:, -idx:].unsqueeze(
	0
	) # .unsqueeze(0)#mq要多unsqueeze一次
	refer = get_spepc(hps, ref_wav_path) # .to(device)
	if is_half == True:
	refer = refer.half().to(device)
	else:
	refer = refer.to(device)
	# audio = vq_model.decode(pred_semantic, all_phoneme_ids, refer).detach().cpu().numpy()[0, 0]
	try:
	audio = (
	vq_model.decode(
	pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refer
	)
	.detach()
	.cpu()
	.numpy()[0, 0]
	)
	except RuntimeError as e:
	wprint(f"The input text does not match the language/输入文本与语言不匹配: {e}")
	return None

	max_audio=np.abs(audio).max()
	if max_audio>1:audio/=max_audio
	audio_opt.append(audio)
	audio_opt.append(zero_wav)
	t4 = ttime()
	print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
	#yield hps.data.sampling_rate, (np.concatenate(audio_opt, 0) * 32768).astype(np.int16)
	audio_data = (np.concatenate(audio_opt, 0) * 32768).astype(np.int16)

	audio_data = (audio_data.astype(np.float32) * volume_scale).astype(np.int16)
	output_wav = "output_audio.wav"
	sf.write(output_wav, audio_data, hps.data.sampling_rate)
	endTime=timer()
	tprint(f'🆗TTS COMPLETE,{round(endTime-startTime,4)}s')
	return output_wav


	def clone_voice(user_voice, user_text, user_lang):
	if not duration(user_voice):
	return None
	if user_text == '':
	wprint("Please enter text to generate/请输入生成文字")
	return None
	user_text = trim_text(user_text, user_lang)
	time1 = timer()
	global gpt_path, sovits_path
	gpt_path = abs_path("pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt")
	sovits_path = abs_path("pretrained_models/s2G488k.pth")
	try:
	prompt_text, prompt_language = transcribe(user_voice)
	except UnboundLocalError as e:
	wprint(f"The language in the audio cannot be recognized ：{str(e)}")
	return None

	output_wav = get_tts_wav(
	user_voice,
	prompt_text,
	prompt_language,
	user_text,
	user_lang,
	how_to_cut="Do not split",
	volume_scale=1.0)
	time2 = timer()
	tprint(f'🆗CLONE COMPLETE,{round(time2-time1,4)}s')
	return output_wav

	def process_text_file(file_path, user_voice, user_lang):
	    """Synthesize every paragraph of a markdown file, one WAV file per line.

	    Lines starting with ``##`` begin a new section and reset the counter.
	    Each following non-blank line is synthesized with ``clone_voice`` and
	    moved to ``out/<file base name>/<heading>-NN.wav``.  Blank lines, and
	    lines that appear before the first heading, are skipped.
	    """
	    # Read as UTF-8 explicitly; the platform default may not decode CJK text.
	    with open(file_path, 'r', encoding='utf-8') as file:
	        lines = file.readlines()

	    current_heading = None
	    file_counter = 0

	    # Extract the base name of the input text file (without extension)
	    base_name = os.path.splitext(os.path.basename(file_path))[0]
	    output_dir = os.path.join('out', base_name)
	    os.makedirs(output_dir, exist_ok=True)

	    for line in lines:
	        line = line.strip()
	        if not line:
	            # Blank separator lines carry nothing to synthesize.
	            continue
	        if line.startswith('##'):
	            # Strip every leading '#', so "### Sub" yields "sub" rather than "#-sub".
	            current_heading = line.lstrip('#').strip().lower().replace(' ', '-')
	            # A path separator in the heading would point into a missing directory.
	            for separator in (os.sep, os.altsep):
	                if separator:
	                    current_heading = current_heading.replace(separator, '-')
	            file_counter = 0
	        elif current_heading:
	            output_wav = clone_voice(user_voice, line, user_lang)
	            if output_wav:
	                output_filename = f"{output_dir}/{current_heading}-{file_counter:02d}.wav"
	                # os.replace overwrites an existing target on every platform.
	                os.replace(output_wav, output_filename)
	                print(f"Generated audio saved to: {output_filename}")
	                file_counter += 1

	def main():
	parser = argparse.ArgumentParser(description="Clone custom voice from terminal")
	parser.add_argument("--audio", type=str, required=True, help="Path to the source audio file")
	parser.add_argument("--text", type=str, required=True, help="Path to the text file to read")
	parser.add_argument("--language", type=str, default="English", help="Language of the text (default: English)")

	args = parser.parse_args()

	user_voice = args.audio
	user_text_file = args.text
	user_lang = args.language

	process_text_file(user_text_file, user_voice, user_lang)

	if __name__ == "__main__":
	main()