eustlb · November 4, 2025 08:29
diff --git a/test_inputs_higgs.py b/test_inputs_higgs.py
 from transformers import AutoProcessor
 from transformers.models.mllama.image_processing_mllama import convert_aspect_ratios_to_ids

 def _read_text(path):
    """Open ``path`` in text mode ("r") and return its entire content."""
    with open(path, "r") as handle:
        return handle.read()


 # Chat template source; it is assigned to ``processor.chat_template`` below.
 chat_template = _read_text("/Users/eustachelebihan/dev/add-higgs-v2/tmp/chat_template.jinja")

 # Reference renderings, one per conversation scenario compared further down.
 expected_single_speaker_with_smart_voice = _read_text(
    "/Users/eustachelebihan/dev/add-higgs-v2/expected/single_speaker_with_smart_voice.txt"
 )
 expected_multi_speaker_with_smart_voice = _read_text(
    "/Users/eustachelebihan/dev/add-higgs-v2/expected/multi_speaker_with_smart_voice.txt"
 )
 expected_zero_shot_voice_cloning = _read_text(
    "/Users/eustachelebihan/dev/add-higgs-v2/expected/zero_shot_voice_cloning.txt"
 )
 expected_multi_speaker_with_voice_clone = _read_text(
    "/Users/eustachelebihan/dev/add-higgs-v2/expected/multi_speaker_with_voice_clone.txt"
 )


 def test(inputs, expected, test_name):
    try:
        assert inputs == expected, f"{test_name} failed"
    except AssertionError:
        print(inputs)
        raise
    print(f"{test_name} passed")
    

 # Build the processor from the local "./tmp-proc" directory, then replace its
 # chat template with the text read from chat_template.jinja.
 processor = AutoProcessor.from_pretrained("./tmp-proc")
 processor.chat_template = chat_template

 # ============================
 # Case 1 - single speaker: system and scene messages use list-style content,
 # while the user turn passes its content as a bare string.
 conversation1 = [
    {"role": "system", "content": [{"type": "text", "text": "Generate audio following instruction."}]},
    {"role": "scene", "content": [{"type": "text", "text": "Audio is recorded from a quiet room."}]},
    {
        "role": "user",
        "content": "The sun rises in the east and sets in the west. This simple fact has been observed by humans for thousands of years.",
    },
 ]

 inputs1 = processor.apply_chat_template(conversation1, add_generation_prompt=True)
 test(inputs1, expected_single_speaker_with_smart_voice, "Single speaker with smart voice")
 # ============================

 # ============================
 # Case 2 - multi speaker, voices described in text: the scene message lists a
 # textual description for SPEAKER0 and SPEAKER1, and the user turn carries a
 # dialogue tagged with [SPEAKER0]/[SPEAKER1].
 # NOTE(review): the continuation lines of the two triple-quoted literals below
 # begin with one space in this file, so that space is part of each string;
 # confirm this matches the expected rendering.
 system_message = """You are an AI assistant designed to convert text into speech.
 If the user's message includes a [SPEAKER*] tag, do not read out the tag and generate speech for the following text, using the specified voice.
 If no speaker tag is present, select a suitable voice on your own."""

 user_message = """[SPEAKER0] I can't believe you did that without even asking me first!
 [SPEAKER1] Oh, come on! It wasn't a big deal, and I knew you would overreact like this.
 [SPEAKER0] Overreact? You made a decision that affects both of us without even considering my opinion!
 [SPEAKER1] Because I didn't have time to sit around waiting for you to make up your mind! Someone had to act."""

 conversation2 = [
    {"role": "system", "content": [{"type": "text", "text": system_message}]},
    {
        "role": "scene",
        "content": [
            {"type": "text", "text": "Audio is recorded from a quiet room."},
            {"type": "text", "text": "SPEAKER0: feminine"},
            {"type": "text", "text": "SPEAKER1: masculine"},
        ],
    },
    {"role": "user", "content": [{"type": "text", "text": user_message}]},
 ]

 inputs2 = processor.apply_chat_template(conversation2, add_generation_prompt=True)
 test(inputs2, expected_multi_speaker_with_smart_voice, "Multi speaker with smart voice")
 # ============================

 # ============================
 # Case 3 - zero shot voice cloning: a first user text turn is answered by an
 # assistant turn holding an audio file, followed by a new user text turn.
 conversation3 = [
    {"role": "system", "content": [{"type": "text", "text": "Generate audio following instruction."}]},
    {"role": "scene", "content": [{"type": "text", "text": "Audio is recorded from a quiet room."}]},
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Twas the night before my birthday. Hooray! It's almost here! It may not be a holiday, but it's the best day of the year.",
            }
        ],
    },
    {
        "role": "assistant",
        "content": [
            {"type": "audio", "audio_url": "/Users/eustachelebihan/dev/add-higgs-v2/voice-prompts/belinda.wav"}
        ],
    },
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "The sun rises in the east and sets in the west. This simple fact has been observed by humans for thousands of years.",
            }
        ],
    },
 ]

 inputs3 = processor.apply_chat_template(conversation3, add_generation_prompt=True)
 test(inputs3, expected_zero_shot_voice_cloning, "Zero shot voice cloning")

 # ============================
 # Case 4 - multi speaker with voice clone: the scene message pairs each
 # "SPEAKERn:" label with an audio URL; the dialogue then alternates a user
 # text turn, an assistant audio turn, and a final user text turn.
 conversation4 = [
    {"role": "system", "content": [{"type": "text", "text": "Generate audio following instruction."}]},
    {
        "role": "scene",
        "content": [
            {"type": "text", "text": "Audio is recorded from a quiet room."},
            {"type": "text", "text": "SPEAKER0:"},
            {
                "type": "audio",
                "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav",
            },
            {"type": "text", "text": "SPEAKER1:"},
            {
                "type": "audio",
                "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/1272-128104-0000.flac",
            },
        ],
    },
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "[SPEAKER0] I can't believe you did that without even asking me first!"}
        ],
    },
    {
        "role": "assistant",
        "content": [
            {
                "type": "audio",
                "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav",
            }
        ],
    },
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "[SPEAKER1] Oh, come on! It wasn't a big deal, and I knew you would overreact like this.",
            }
        ],
    },
 ]

 inputs4 = processor.apply_chat_template(conversation4, add_generation_prompt=True)
 test(inputs4, expected_multi_speaker_with_voice_clone, "Multi speaker with voice clone")
 # ============================
	from transformers import AutoProcessor
	from transformers.models.mllama.image_processing_mllama import convert_aspect_ratios_to_ids

	# Load the chat template from file.
	# Each ``with`` body is indented under its header so the file handle is
	# closed as soon as the content has been read.
	with open("/Users/eustachelebihan/dev/add-higgs-v2/tmp/chat_template.jinja", "r") as f:
	    chat_template = f.read()

	# Load expected outputs for comparison (one reference text per scenario).
	with open("/Users/eustachelebihan/dev/add-higgs-v2/expected/single_speaker_with_smart_voice.txt", "r") as f:
	    expected_single_speaker_with_smart_voice = f.read()

	with open("/Users/eustachelebihan/dev/add-higgs-v2/expected/multi_speaker_with_smart_voice.txt", "r") as f:
	    expected_multi_speaker_with_smart_voice = f.read()

	with open("/Users/eustachelebihan/dev/add-higgs-v2/expected/zero_shot_voice_cloning.txt", "r") as f:
	    expected_zero_shot_voice_cloning = f.read()

	with open("/Users/eustachelebihan/dev/add-higgs-v2/expected/multi_speaker_with_voice_clone.txt", "r") as f:
	    expected_multi_speaker_with_voice_clone = f.read()


	def test(inputs, expected, test_name):
	try:
	assert inputs == expected, f"{test_name} failed"
	except AssertionError:
	print(inputs)
	raise
	print(f"{test_name} passed")


	# Build the processor from the local "./tmp-proc" directory, then replace its
	# chat template with the text read from chat_template.jinja.
	processor = AutoProcessor.from_pretrained("./tmp-proc")
	processor.chat_template = chat_template

	# ============================
	# Case 1 - single speaker: system and scene messages use list-style content,
	# while the user turn passes its content as a bare string.
	conversation1 = [
	    {"role": "system", "content": [{"type": "text", "text": "Generate audio following instruction."}]},
	    {"role": "scene", "content": [{"type": "text", "text": "Audio is recorded from a quiet room."}]},
	    {
	        "role": "user",
	        "content": "The sun rises in the east and sets in the west. This simple fact has been observed by humans for thousands of years.",
	    },
	]

	inputs1 = processor.apply_chat_template(conversation1, add_generation_prompt=True)
	test(inputs1, expected_single_speaker_with_smart_voice, "Single speaker with smart voice")
	# ============================

	# ============================
	# Case 2 - multi speaker, voices described in text: the scene message lists a
	# textual description for SPEAKER0 and SPEAKER1, and the user turn carries a
	# dialogue tagged with [SPEAKER0]/[SPEAKER1].
	# NOTE(review): the continuation lines of the two triple-quoted literals below
	# begin with a tab character in this file, so that tab is part of each string;
	# confirm this matches the expected rendering.
	system_message = """You are an AI assistant designed to convert text into speech.
	If the user's message includes a [SPEAKER*] tag, do not read out the tag and generate speech for the following text, using the specified voice.
	If no speaker tag is present, select a suitable voice on your own."""

	user_message = """[SPEAKER0] I can't believe you did that without even asking me first!
	[SPEAKER1] Oh, come on! It wasn't a big deal, and I knew you would overreact like this.
	[SPEAKER0] Overreact? You made a decision that affects both of us without even considering my opinion!
	[SPEAKER1] Because I didn't have time to sit around waiting for you to make up your mind! Someone had to act."""

	conversation2 = [
	    {"role": "system", "content": [{"type": "text", "text": system_message}]},
	    {
	        "role": "scene",
	        "content": [
	            {"type": "text", "text": "Audio is recorded from a quiet room."},
	            {"type": "text", "text": "SPEAKER0: feminine"},
	            {"type": "text", "text": "SPEAKER1: masculine"},
	        ],
	    },
	    {"role": "user", "content": [{"type": "text", "text": user_message}]},
	]

	inputs2 = processor.apply_chat_template(conversation2, add_generation_prompt=True)
	test(inputs2, expected_multi_speaker_with_smart_voice, "Multi speaker with smart voice")
	# ============================

	# ============================
	# Case 3 - zero shot voice cloning: a first user text turn is answered by an
	# assistant turn holding an audio file, followed by a new user text turn.
	conversation3 = [
	    {"role": "system", "content": [{"type": "text", "text": "Generate audio following instruction."}]},
	    {"role": "scene", "content": [{"type": "text", "text": "Audio is recorded from a quiet room."}]},
	    {
	        "role": "user",
	        "content": [
	            {
	                "type": "text",
	                "text": "Twas the night before my birthday. Hooray! It's almost here! It may not be a holiday, but it's the best day of the year.",
	            }
	        ],
	    },
	    {
	        "role": "assistant",
	        "content": [
	            {"type": "audio", "audio_url": "/Users/eustachelebihan/dev/add-higgs-v2/voice-prompts/belinda.wav"}
	        ],
	    },
	    {
	        "role": "user",
	        "content": [
	            {
	                "type": "text",
	                "text": "The sun rises in the east and sets in the west. This simple fact has been observed by humans for thousands of years.",
	            }
	        ],
	    },
	]

	inputs3 = processor.apply_chat_template(conversation3, add_generation_prompt=True)
	test(inputs3, expected_zero_shot_voice_cloning, "Zero shot voice cloning")

	# ============================
	# Case 4 - multi speaker with voice clone: the scene message pairs each
	# "SPEAKERn:" label with an audio URL; the dialogue then alternates a user
	# text turn, an assistant audio turn, and a final user text turn.
	conversation4 = [
	    {"role": "system", "content": [{"type": "text", "text": "Generate audio following instruction."}]},
	    {
	        "role": "scene",
	        "content": [
	            {"type": "text", "text": "Audio is recorded from a quiet room."},
	            {"type": "text", "text": "SPEAKER0:"},
	            {
	                "type": "audio",
	                "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav",
	            },
	            {"type": "text", "text": "SPEAKER1:"},
	            {
	                "type": "audio",
	                "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/1272-128104-0000.flac",
	            },
	        ],
	    },
	    {
	        "role": "user",
	        "content": [
	            {"type": "text", "text": "[SPEAKER0] I can't believe you did that without even asking me first!"}
	        ],
	    },
	    {
	        "role": "assistant",
	        "content": [
	            {
	                "type": "audio",
	                "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav",
	            }
	        ],
	    },
	    {
	        "role": "user",
	        "content": [
	            {
	                "type": "text",
	                "text": "[SPEAKER1] Oh, come on! It wasn't a big deal, and I knew you would overreact like this.",
	            }
	        ],
	    },
	]

	inputs4 = processor.apply_chat_template(conversation4, add_generation_prompt=True)
	test(inputs4, expected_multi_speaker_with_voice_clone, "Multi speaker with voice clone")
	# ============================
No results found