Skip to content

Instantly share code, notes, and snippets.

@eustlb
Created November 4, 2025 08:29
Show Gist options
  • Select an option

  • Save eustlb/d7e55b17dc089511ae9cf826eaf3b570 to your computer and use it in GitHub Desktop.

Select an option

Save eustlb/d7e55b17dc089511ae9cf826eaf3b570 to your computer and use it in GitHub Desktop.
from transformers import AutoProcessor
from transformers.models.mllama.image_processing_mllama import convert_aspect_ratios_to_ids
def _read_text(path):
    """Return the entire contents of the text file at ``path``.

    Args:
        path: Filesystem path of the file to read.

    Returns:
        The file contents as a single string.
    """
    # Decode explicitly as UTF-8 so the loaded text does not depend on the
    # platform's default locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


# Load the chat template from file
chat_template = _read_text("/Users/eustachelebihan/dev/add-higgs-v2/tmp/chat_template.jinja")

# Load expected outputs for comparison (one reference text per scenario below)
expected_single_speaker_with_smart_voice = _read_text(
    "/Users/eustachelebihan/dev/add-higgs-v2/expected/single_speaker_with_smart_voice.txt"
)
expected_multi_speaker_with_smart_voice = _read_text(
    "/Users/eustachelebihan/dev/add-higgs-v2/expected/multi_speaker_with_smart_voice.txt"
)
expected_zero_shot_voice_cloning = _read_text(
    "/Users/eustachelebihan/dev/add-higgs-v2/expected/zero_shot_voice_cloning.txt"
)
expected_multi_speaker_with_voice_clone = _read_text(
    "/Users/eustachelebihan/dev/add-higgs-v2/expected/multi_speaker_with_voice_clone.txt"
)
def test(inputs, expected, test_name):
    """Compare a rendered value with its reference and report the outcome.

    Prints ``"<test_name> passed"`` when the two values are equal. On a
    mismatch, prints ``inputs`` so the actual value can be inspected and
    then raises.

    Args:
        inputs: The value produced by the code under test.
        expected: The reference value ``inputs`` must equal.
        test_name: Human-readable label used in the printed and raised messages.

    Raises:
        AssertionError: If ``inputs`` differs from ``expected``; the message
            is ``"<test_name> failed"``.
    """
    # Compare explicitly instead of using an `assert` statement: asserts are
    # removed when Python runs with -O, which would make every check pass.
    if inputs != expected:
        print(inputs)
        raise AssertionError(f"{test_name} failed")
    print(f"{test_name} passed")


# The name matches pytest's default `test*` collection pattern; opt out so
# pytest does not try to collect this helper as a test case.
test.__test__ = False
# Build the processor from the relative path "./tmp-proc".
processor = AutoProcessor.from_pretrained("./tmp-proc")
# Override the processor's chat template with the text loaded into
# `chat_template`; the apply_chat_template calls below run after this assignment.
processor.chat_template = chat_template
# ============================
# Scenario 1: system + scene + one user turn; the conversation has no audio parts.
conversation1 = [
    {
        "role": "system",
        "content": [{"type": "text", "text": "Generate audio following instruction."}],
    },
    {
        "role": "scene",
        "content": [{"type": "text", "text": "Audio is recorded from a quiet room."}],
    },
    # This user turn passes its content as a bare string rather than a list of typed parts.
    {
        "role": "user",
        "content": "The sun rises in the east and sets in the west. This simple fact has been observed by humans for thousands of years.",
    },
]
# Render the conversation and compare the result with the stored reference text.
inputs1 = processor.apply_chat_template(conversation1, add_generation_prompt=True)
test(inputs1, expected_single_speaker_with_smart_voice, "Single speaker with smart voice")
# ============================
# ============================
# Scenario 2: two tagged speakers described by text-only scene entries.
system_message = """You are an AI assistant designed to convert text into speech.
If the user's message includes a [SPEAKER*] tag, do not read out the tag and generate speech for the following text, using the specified voice.
If no speaker tag is present, select a suitable voice on your own."""
user_message = """[SPEAKER0] I can't believe you did that without even asking me first!
[SPEAKER1] Oh, come on! It wasn't a big deal, and I knew you would overreact like this.
[SPEAKER0] Overreact? You made a decision that affects both of us without even considering my opinion!
[SPEAKER1] Because I didn't have time to sit around waiting for you to make up your mind! Someone had to act."""
conversation2 = [
    {
        "role": "system",
        "content": [{"type": "text", "text": system_message}],
    },
    {
        "role": "scene",
        # One text part per scene line, in this order.
        "content": [
            {"type": "text", "text": scene_text}
            for scene_text in (
                "Audio is recorded from a quiet room.",
                "SPEAKER0: feminine",
                "SPEAKER1: masculine",
            )
        ],
    },
    {
        "role": "user",
        "content": [{"type": "text", "text": user_message}],
    },
]
# Render the conversation and compare the result with the stored reference text.
inputs2 = processor.apply_chat_template(conversation2, add_generation_prompt=True)
test(inputs2, expected_multi_speaker_with_smart_voice, "Multi speaker with smart voice")
# ============================
# ============================
# Scenario 3: a user text turn answered by an assistant audio turn, followed by
# a second user text turn.
conversation3 = [
    {
        "role": "system",
        "content": [{"type": "text", "text": "Generate audio following instruction."}],
    },
    {
        "role": "scene",
        "content": [{"type": "text", "text": "Audio is recorded from a quiet room."}],
    },
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Twas the night before my birthday. Hooray! It's almost here! It may not be a holiday, but it's the best day of the year.",
            },
        ],
    },
    {
        "role": "assistant",
        # The audio part points at a file on the local filesystem.
        "content": [
            {
                "type": "audio",
                "audio_url": "/Users/eustachelebihan/dev/add-higgs-v2/voice-prompts/belinda.wav",
            },
        ],
    },
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "The sun rises in the east and sets in the west. This simple fact has been observed by humans for thousands of years.",
            },
        ],
    },
]
# Render the conversation and compare the result with the stored reference text.
inputs3 = processor.apply_chat_template(conversation3, add_generation_prompt=True)
test(inputs3, expected_zero_shot_voice_cloning, "Zero shot voice cloning")
# ============================
# Scenario 4: the scene pairs each speaker label with an audio part; the
# dialogue then alternates user text and assistant audio.
conversation4 = [
    {
        "role": "system",
        "content": [{"type": "text", "text": "Generate audio following instruction."}],
    },
    {
        "role": "scene",
        "content": [
            {"type": "text", "text": "Audio is recorded from a quiet room."},
            # Each "SPEAKERn:" text part is immediately followed by an audio part.
            {"type": "text", "text": "SPEAKER0:"},
            {
                "type": "audio",
                "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav",
            },
            {"type": "text", "text": "SPEAKER1:"},
            {
                "type": "audio",
                "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/1272-128104-0000.flac",
            },
        ],
    },
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "[SPEAKER0] I can't believe you did that without even asking me first!",
            },
        ],
    },
    {
        "role": "assistant",
        # Same audio URL as the one listed after "SPEAKER0:" in the scene.
        "content": [
            {
                "type": "audio",
                "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav",
            },
        ],
    },
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "[SPEAKER1] Oh, come on! It wasn't a big deal, and I knew you would overreact like this.",
            },
        ],
    },
]
# Render the conversation and compare the result with the stored reference text.
inputs4 = processor.apply_chat_template(conversation4, add_generation_prompt=True)
test(inputs4, expected_multi_speaker_with_voice_clone, "Multi speaker with voice clone")
# ============================
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment