Created
November 4, 2025 08:29
-
-
Save eustlb/d7e55b17dc089511ae9cf826eaf3b570 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from transformers import AutoProcessor | |
| from transformers.models.mllama.image_processing_mllama import convert_aspect_ratios_to_ids | |
def _read_text(path):
    """Return the full contents of the text file at ``path``, decoded as UTF-8."""
    # An explicit encoding keeps the result independent of the platform's
    # locale default, so the comparisons below behave the same everywhere.
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


# Load the chat template from file
chat_template = _read_text(
    "/Users/eustachelebihan/dev/add-higgs-v2/tmp/chat_template.jinja"
)

# Load expected outputs for comparison (one reference file per scenario)
expected_single_speaker_with_smart_voice = _read_text(
    "/Users/eustachelebihan/dev/add-higgs-v2/expected/single_speaker_with_smart_voice.txt"
)
expected_multi_speaker_with_smart_voice = _read_text(
    "/Users/eustachelebihan/dev/add-higgs-v2/expected/multi_speaker_with_smart_voice.txt"
)
expected_zero_shot_voice_cloning = _read_text(
    "/Users/eustachelebihan/dev/add-higgs-v2/expected/zero_shot_voice_cloning.txt"
)
expected_multi_speaker_with_voice_clone = _read_text(
    "/Users/eustachelebihan/dev/add-higgs-v2/expected/multi_speaker_with_voice_clone.txt"
)
def test(inputs, expected, test_name):
    """Compare a produced value with its reference and report the result.

    Args:
        inputs: Value produced by the code under test (in this script, the
            output of ``processor.apply_chat_template``).
        expected: Reference value to compare against.
        test_name: Human-readable label used in the pass/fail message.

    Raises:
        AssertionError: If ``inputs == expected`` is false. The actual value
            is printed first so the mismatch can be inspected.
    """
    # Explicit comparison instead of an ``assert`` statement: asserts are
    # stripped under ``python -O``, which would make every check pass.
    if not (inputs == expected):
        print(inputs)
        raise AssertionError(f"{test_name} failed")
    print(f"{test_name} passed")


# The name matches pytest's default ``test*`` pattern; opt out of collection
# so this helper is never mistaken for a test case.
test.__test__ = False
# Build the processor from the local checkpoint directory and swap in the
# chat template under test.
processor = AutoProcessor.from_pretrained("./tmp-proc")
processor.chat_template = chat_template

# ============================
# Scenario 1: system + scene + a single user turn given as a plain string.
conversation1 = [
    {
        "role": "system",
        "content": [{"type": "text", "text": "Generate audio following instruction."}],
    },
    {
        "role": "scene",
        "content": [{"type": "text", "text": "Audio is recorded from a quiet room."}],
    },
    {
        "role": "user",
        "content": "The sun rises in the east and sets in the west. This simple fact has been observed by humans for thousands of years.",
    },
]

inputs1 = processor.apply_chat_template(conversation1, add_generation_prompt=True)
test(inputs1, expected_single_speaker_with_smart_voice, "Single speaker with smart voice")
# ============================
# ============================
# Scenario 2: two tagged speakers, each described only by text in the scene.
system_message = """You are an AI assistant designed to convert text into speech.
If the user's message includes a [SPEAKER*] tag, do not read out the tag and generate speech for the following text, using the specified voice.
If no speaker tag is present, select a suitable voice on your own."""

user_message = """[SPEAKER0] I can't believe you did that without even asking me first!
[SPEAKER1] Oh, come on! It wasn't a big deal, and I knew you would overreact like this.
[SPEAKER0] Overreact? You made a decision that affects both of us without even considering my opinion!
[SPEAKER1] Because I didn't have time to sit around waiting for you to make up your mind! Someone had to act."""

conversation2 = [
    {
        "role": "system",
        "content": [{"type": "text", "text": system_message}],
    },
    {
        "role": "scene",
        "content": [
            {"type": "text", "text": "Audio is recorded from a quiet room."},
            {"type": "text", "text": "SPEAKER0: feminine"},
            {"type": "text", "text": "SPEAKER1: masculine"},
        ],
    },
    {
        "role": "user",
        "content": [{"type": "text", "text": user_message}],
    },
]

inputs2 = processor.apply_chat_template(conversation2, add_generation_prompt=True)
test(inputs2, expected_multi_speaker_with_smart_voice, "Multi speaker with smart voice")
# ============================
# ============================
# Scenario 3: a prior user/assistant exchange whose assistant turn is an audio
# clip, followed by the new user text to render.
conversation3 = [
    {
        "role": "system",
        "content": [{"type": "text", "text": "Generate audio following instruction."}],
    },
    {
        "role": "scene",
        "content": [{"type": "text", "text": "Audio is recorded from a quiet room."}],
    },
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Twas the night before my birthday. Hooray! It's almost here! It may not be a holiday, but it's the best day of the year.",
            },
        ],
    },
    {
        "role": "assistant",
        "content": [
            {
                "type": "audio",
                "audio_url": "/Users/eustachelebihan/dev/add-higgs-v2/voice-prompts/belinda.wav",
            },
        ],
    },
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "The sun rises in the east and sets in the west. This simple fact has been observed by humans for thousands of years.",
            },
        ],
    },
]

inputs3 = processor.apply_chat_template(conversation3, add_generation_prompt=True)
test(inputs3, expected_zero_shot_voice_cloning, "Zero shot voice cloning")
# ============================
# Scenario 4: two speakers, each introduced in the scene by a text label
# followed by an audio clip; the dialogue then alternates user text and
# assistant audio.
conversation4 = [
    {
        "role": "system",
        "content": [{"type": "text", "text": "Generate audio following instruction."}],
    },
    {
        "role": "scene",
        "content": [
            {"type": "text", "text": "Audio is recorded from a quiet room."},
            {"type": "text", "text": "SPEAKER0:"},
            {
                "type": "audio",
                "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav",
            },
            {"type": "text", "text": "SPEAKER1:"},
            {
                "type": "audio",
                "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/1272-128104-0000.flac",
            },
        ],
    },
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "[SPEAKER0] I can't believe you did that without even asking me first!",
            },
        ],
    },
    {
        "role": "assistant",
        "content": [
            {
                "type": "audio",
                "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav",
            },
        ],
    },
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "[SPEAKER1] Oh, come on! It wasn't a big deal, and I knew you would overreact like this.",
            },
        ],
    },
]

inputs4 = processor.apply_chat_template(conversation4, add_generation_prompt=True)
test(inputs4, expected_multi_speaker_with_voice_clone, "Multi speaker with voice clone")
# ============================
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment