Created
November 4, 2025 18:05
-
-
Save eustlb/30bbcc10e9349a34d857aafed6b30e21 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from transformers import AutoProcessor, HiggsAudioForConditionalGeneration

# Hugging Face checkpoint for Higgs v2 text-to-speech generation.
model_id = "eustlb/higgs-v2"

# NOTE(review): `device_map` is a *model*-loading argument; a processor holds no
# weights, so passing `device_map="cuda"` here was a spurious no-op and has been
# removed. Device placement stays on the model load below.
processor = AutoProcessor.from_pretrained(model_id)
# The tokenizer ships without a pad token; reuse EOS so batched inputs can be padded.
processor.tokenizer.pad_token = processor.tokenizer.eos_token

model = HiggsAudioForConditionalGeneration.from_pretrained(model_id, device_map="cuda")
# --- Single-speaker smart voice: the model selects a suitable voice on its own. ---
def _text_turn(role, text):
    # Build one chat turn carrying a single text segment.
    return {"role": role, "content": [{"type": "text", "text": text}]}

conversation = [
    _text_turn("system", "Generate audio following instruction."),
    _text_turn("scene", "Audio is recorded from a quiet room."),
    _text_turn(
        "user",
        "The sun rises in the east and sets in the west. This simple fact has been observed by humans for thousands of years.",
    ),
]
inputs = processor.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    sampling_rate=24000,
).to(model.device)
# Greedy decoding; the generous token budget lets the audio stream finish naturally.
outputs = processor.batch_decode(model.generate(**inputs, max_new_tokens=10000, do_sample=False))
processor.save_audio(outputs, "output_single_speaker_smart_voice.wav")
# --- Multi-speaker smart voice: [SPEAKER*] tags steer per-speaker voice choice. ---
system_message = """You are an AI assistant designed to convert text into speech.
If the user's message includes a [SPEAKER*] tag, do not read out the tag and generate speech for the following text, using the specified voice.
If no speaker tag is present, select a suitable voice on your own."""
user_message = """[SPEAKER0] I can't believe you did that without even asking me first!
[SPEAKER1] Oh, come on! It wasn't a big deal, and I knew you would overreact like this.
[SPEAKER0] Overreact? You made a decision that affects both of us without even considering my opinion!
[SPEAKER1] Because I didn't have time to sit around waiting for you to make up your mind! Someone had to act."""

# Scene description plus one voice hint per speaker tag.
_scene_segments = [
    {"type": "text", "text": segment}
    for segment in (
        "Audio is recorded from a quiet room.",
        "SPEAKER0: feminine",
        "SPEAKER1: masculine",
    )
]
conversation = [
    {"role": "system", "content": [{"type": "text", "text": system_message}]},
    {"role": "scene", "content": _scene_segments},
    {"role": "user", "content": [{"type": "text", "text": user_message}]},
]
inputs = processor.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    sampling_rate=24000,
).to(model.device)
outputs = processor.batch_decode(model.generate(**inputs, max_new_tokens=10000, do_sample=False))
processor.save_audio(outputs, "output_multi_speaker_smart_voice.wav")
# --- Zero-shot voice cloning: a reference audio clip conditions the output voice. ---
_REFERENCE_VOICE_URL = "https://huggingface.co/datasets/eustlb/dummy-audio-samples-higgs/resolve/main/belinda.wav"

conversation = [
    {"role": "system", "content": [{"type": "text", "text": "Generate audio following instruction."}]},
    {"role": "scene", "content": [{"type": "text", "text": "Audio is recorded from a quiet room."}]},
    # In-context example: this user text presumably transcribes the reference
    # clip, and the assistant "answers" with the clip itself — TODO confirm
    # against the model card.
    {"role": "user", "content": [{"type": "text", "text": "Twas the night before my birthday. Hooray! It's almost here! It may not be a holiday, but it's the best day of the year."}]},
    {"role": "assistant", "content": [{"type": "audio", "url": _REFERENCE_VOICE_URL}]},
    # The actual text to synthesize in the cloned voice.
    {"role": "user", "content": [{"type": "text", "text": "The sun rises in the east and sets in the west. This simple fact has been observed by humans for thousands of years."}]},
]
inputs = processor.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    sampling_rate=24000,
).to(model.device)
outputs = processor.batch_decode(model.generate(**inputs, max_new_tokens=10000, do_sample=False))
processor.save_audio(outputs, "output_zero_shot_voice_cloning.wav")
# --- Multi-speaker voice cloning: each speaker tag is paired with a reference
# --- audio clip inside the scene turn.
conversation = [
    {"role": "system", "content": [{"type": "text", "text": "Generate audio following instruction."}]},
    {
        "role": "scene",
        "content": [
            {"type": "text", "text": "Audio is recorded from a quiet room."},
            # Tag label immediately followed by that speaker's reference audio.
            {"type": "text", "text": "SPEAKER0:"},
            {"type": "audio", "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav"},
            {"type": "text", "text": "SPEAKER1:"},
            {"type": "audio", "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/1272-128104-0000.flac"},
        ],
    },
    {"role": "user", "content": [{"type": "text", "text": "[SPEAKER0] I can't believe you did that without even asking me first!"}]},
]
inputs = processor.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    sampling_rate=24000,
).to(model.device)
outputs = processor.batch_decode(model.generate(**inputs, max_new_tokens=10000, do_sample=False))
processor.save_audio(outputs, "output_multi_speaker_voice_cloning.wav")
# --- Batched inference: two cloning conversations run through one generate call. ---
def _cloning_conversation(reference_transcript, reference_audio_url, target_text):
    # Standard zero-shot cloning layout: system + scene, a transcript/audio
    # in-context pair, then the text to synthesize in the cloned voice.
    return [
        {"role": "system", "content": [{"type": "text", "text": "Generate audio following instruction."}]},
        {"role": "scene", "content": [{"type": "text", "text": "Audio is recorded from a quiet room."}]},
        {"role": "user", "content": [{"type": "text", "text": reference_transcript}]},
        {"role": "assistant", "content": [{"type": "audio", "url": reference_audio_url}]},
        {"role": "user", "content": [{"type": "text", "text": target_text}]},
    ]

conversation1 = _cloning_conversation(
    "Twas the night before my birthday. Hooray! It's almost here! It may not be a holiday, but it's the best day of the year.",
    "https://huggingface.co/datasets/eustlb/dummy-audio-samples-higgs/resolve/main/belinda.wav",
    "The sun rises in the east and sets in the west. This simple fact has been observed by humans for thousands of years.",
)
conversation2 = _cloning_conversation(
    " It's super important to assess fairly the fact that our former model is over. And this is not a question of adjustment. This is not the same world, 2024, 2025. And on top of that, we are making the same mistakes, on top of the key elements I mentioned. We are over-regulating and under-investing. So just if, in the two to three years to come, if we follow our classical agenda, we will be out of the market. I have no doubts.",
    "https://huggingface.co/datasets/eustlb/dummy-audio-samples-higgs/resolve/main/macron.wav",
    "Hey, here is a clone from the given voice.",
)
inputs = processor.apply_chat_template(
    [conversation1, conversation2],
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    sampling_rate=24000,
).to(model.device)
outputs = processor.batch_decode(model.generate(**inputs, max_new_tokens=10000, do_sample=False))
# One output file per conversation in the batch.
processor.save_audio(outputs, ["output_batched_1.wav", "output_batched_2.wav"])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment