Skip to content

Instantly share code, notes, and snippets.

@eustlb
Created November 4, 2025 18:05
Show Gist options
  • Select an option

  • Save eustlb/30bbcc10e9349a34d857aafed6b30e21 to your computer and use it in GitHub Desktop.
# Setup: load the Higgs Audio v2 processor and model for text-to-speech generation.
from transformers import AutoProcessor, HiggsAudioForConditionalGeneration

model_id = "eustlb/higgs-v2"

# Fix: `device_map` is a model-loading argument, not a processor argument — the
# processor/tokenizer run on CPU, so passing device_map="cuda" here had no effect.
processor = AutoProcessor.from_pretrained(model_id)
# The tokenizer ships without a pad token; reuse EOS so padded (batched) inputs work.
processor.tokenizer.pad_token = processor.tokenizer.eos_token
# The model itself is placed on GPU via device_map.
model = HiggsAudioForConditionalGeneration.from_pretrained(model_id, device_map="cuda")
# --- Single speaker, voice selected automatically by the model ("smart voice") ---
single_speaker_chat = [
    {
        "role": "system",
        "content": [{"type": "text", "text": "Generate audio following instruction."}],
    },
    {
        "role": "scene",
        "content": [{"type": "text", "text": "Audio is recorded from a quiet room."}],
    },
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "The sun rises in the east and sets in the west. This simple fact has been observed by humans for thousands of years.",
            }
        ],
    },
]
inputs = processor.apply_chat_template(
    single_speaker_chat,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    sampling_rate=24000,
).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=10000, do_sample=False)
outputs = processor.batch_decode(outputs)
processor.save_audio(outputs, "output_single_speaker_smart_voice.wav")
# --- Multi-speaker dialogue, voices selected automatically per [SPEAKER*] tag ---
system_message = """You are an AI assistant designed to convert text into speech.
If the user's message includes a [SPEAKER*] tag, do not read out the tag and generate speech for the following text, using the specified voice.
If no speaker tag is present, select a suitable voice on your own."""
user_message = """[SPEAKER0] I can't believe you did that without even asking me first!
[SPEAKER1] Oh, come on! It wasn't a big deal, and I knew you would overreact like this.
[SPEAKER0] Overreact? You made a decision that affects both of us without even considering my opinion!
[SPEAKER1] Because I didn't have time to sit around waiting for you to make up your mind! Someone had to act."""

# Scene description also carries per-speaker voice hints for the model.
scene_description = [
    {"type": "text", "text": "Audio is recorded from a quiet room."},
    {"type": "text", "text": "SPEAKER0: feminine"},
    {"type": "text", "text": "SPEAKER1: masculine"},
]
multi_speaker_chat = [
    {"role": "system", "content": [{"type": "text", "text": system_message}]},
    {"role": "scene", "content": scene_description},
    {"role": "user", "content": [{"type": "text", "text": user_message}]},
]
inputs = processor.apply_chat_template(
    multi_speaker_chat,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    sampling_rate=24000,
).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=10000, do_sample=False)
outputs = processor.batch_decode(outputs)
processor.save_audio(outputs, "output_multi_speaker_smart_voice.wav")
# --- Zero-shot voice cloning: a user/assistant audio exchange supplies the
# reference voice, then the final user turn is synthesized in that voice. ---
reference_text = "Twas the night before my birthday. Hooray! It's almost here! It may not be a holiday, but it's the best day of the year."
reference_audio_url = "https://huggingface.co/datasets/eustlb/dummy-audio-samples-higgs/resolve/main/belinda.wav"
target_text = "The sun rises in the east and sets in the west. This simple fact has been observed by humans for thousands of years."

cloning_chat = [
    {
        "role": "system",
        "content": [{"type": "text", "text": "Generate audio following instruction."}],
    },
    {
        "role": "scene",
        "content": [{"type": "text", "text": "Audio is recorded from a quiet room."}],
    },
    # In-context example: the transcript of the reference clip...
    {"role": "user", "content": [{"type": "text", "text": reference_text}]},
    # ...paired with the reference audio itself.
    {"role": "assistant", "content": [{"type": "audio", "url": reference_audio_url}]},
    # The text to synthesize in the cloned voice.
    {"role": "user", "content": [{"type": "text", "text": target_text}]},
]
inputs = processor.apply_chat_template(
    cloning_chat,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    sampling_rate=24000,
).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=10000, do_sample=False)
outputs = processor.batch_decode(outputs)
processor.save_audio(outputs, "output_zero_shot_voice_cloning.wav")
# --- Multi-speaker voice cloning: reference audio clips in the scene turn bind
# each [SPEAKER*] tag to a voice. ---
multi_clone_chat = [
    {
        "role": "system",
        "content": [{"type": "text", "text": "Generate audio following instruction."}],
    },
    {
        "role": "scene",
        "content": [
            {"type": "text", "text": "Audio is recorded from a quiet room."},
            # SPEAKER0's voice is defined by the audio clip that follows its tag.
            {"type": "text", "text": "SPEAKER0:"},
            {
                "type": "audio",
                "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav",
            },
            # Likewise for SPEAKER1.
            {"type": "text", "text": "SPEAKER1:"},
            {
                "type": "audio",
                "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/1272-128104-0000.flac",
            },
        ],
    },
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "[SPEAKER0] I can't believe you did that without even asking me first!",
            }
        ],
    },
]
inputs = processor.apply_chat_template(
    multi_clone_chat,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    sampling_rate=24000,
).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=10000, do_sample=False)
outputs = processor.batch_decode(outputs)
processor.save_audio(outputs, "output_multi_speaker_voice_cloning.wav")
# --- Batched inference: two independent voice-cloning conversations processed
# in a single padded batch; one output file per conversation. ---
conversation1 = [
    {
        "role": "system",
        "content": [{"type": "text", "text": "Generate audio following instruction."}],
    },
    {
        "role": "scene",
        "content": [{"type": "text", "text": "Audio is recorded from a quiet room."}],
    },
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Twas the night before my birthday. Hooray! It's almost here! It may not be a holiday, but it's the best day of the year.",
            }
        ],
    },
    {
        "role": "assistant",
        "content": [
            {
                "type": "audio",
                "url": "https://huggingface.co/datasets/eustlb/dummy-audio-samples-higgs/resolve/main/belinda.wav",
            }
        ],
    },
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "The sun rises in the east and sets in the west. This simple fact has been observed by humans for thousands of years.",
            }
        ],
    },
]
conversation2 = [
    {
        "role": "system",
        "content": [{"type": "text", "text": "Generate audio following instruction."}],
    },
    {
        "role": "scene",
        "content": [{"type": "text", "text": "Audio is recorded from a quiet room."}],
    },
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": " It's super important to assess fairly the fact that our former model is over. And this is not a question of adjustment. This is not the same world, 2024, 2025. And on top of that, we are making the same mistakes, on top of the key elements I mentioned. We are over-regulating and under-investing. So just if, in the two to three years to come, if we follow our classical agenda, we will be out of the market. I have no doubts.",
            }
        ],
    },
    {
        "role": "assistant",
        "content": [
            {
                "type": "audio",
                "url": "https://huggingface.co/datasets/eustlb/dummy-audio-samples-higgs/resolve/main/macron.wav",
            }
        ],
    },
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Hey, here is a clone from the given voice."}
        ],
    },
]
# A list of conversations yields one padded batch (pad token was set above).
inputs = processor.apply_chat_template(
    [conversation1, conversation2],
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    sampling_rate=24000,
).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=10000, do_sample=False)
outputs = processor.batch_decode(outputs)
processor.save_audio(outputs, ["output_batched_1.wav", "output_batched_2.wav"])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment