Created
November 4, 2025 18:05
-
-
Save eustlb/30bbcc10e9349a34d857aafed6b30e21 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from transformers import AutoProcessor, HiggsAudioForConditionalGeneration

# Hugging Face checkpoint for Higgs v2 text-to-speech generation.
model_id = "eustlb/higgs-v2"

# NOTE(review): `device_map` is a *model*-loading argument; a processor holds no
# weights, so passing `device_map="cuda"` here was a spurious no-op and has been
# removed. Device placement stays on the model load below.
processor = AutoProcessor.from_pretrained(model_id)
# The tokenizer ships without a pad token; reuse EOS so batched inputs can be padded.
processor.tokenizer.pad_token = processor.tokenizer.eos_token

model = HiggsAudioForConditionalGeneration.from_pretrained(model_id, device_map="cuda")
# --- Single-speaker smart voice: the model selects a suitable voice on its own. ---
def _text_turn(role, text):
    # Build one chat turn carrying a single text segment.
    return {"role": role, "content": [{"type": "text", "text": text}]}

conversation = [
    _text_turn("system", "Generate audio following instruction."),
    _text_turn("scene", "Audio is recorded from a quiet room."),
    _text_turn(
        "user",
        "The sun rises in the east and sets in the west. This simple fact has been observed by humans for thousands of years.",
    ),
]
inputs = processor.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    sampling_rate=24000,
).to(model.device)
# Greedy decoding; the generous token budget lets the audio stream finish naturally.
outputs = processor.batch_decode(model.generate(**inputs, max_new_tokens=10000, do_sample=False))
processor.save_audio(outputs, "output_single_speaker_smart_voice.wav")
# --- Multi-speaker smart voice: [SPEAKER*] tags steer per-speaker voice choice. ---
system_message = """You are an AI assistant designed to convert text into speech.
If the user's message includes a [SPEAKER*] tag, do not read out the tag and generate speech for the following text, using the specified voice.
If no speaker tag is present, select a suitable voice on your own."""
user_message = """[SPEAKER0] I can't believe you did that without even asking me first!
[SPEAKER1] Oh, come on! It wasn't a big deal, and I knew you would overreact like this.
[SPEAKER0] Overreact? You made a decision that affects both of us without even considering my opinion!
[SPEAKER1] Because I didn't have time to sit around waiting for you to make up your mind! Someone had to act."""

# Scene description plus one voice hint per speaker tag.
_scene_segments = [
    {"type": "text", "text": segment}
    for segment in (
        "Audio is recorded from a quiet room.",
        "SPEAKER0: feminine",
        "SPEAKER1: masculine",
    )
]
conversation = [
    {"role": "system", "content": [{"type": "text", "text": system_message}]},
    {"role": "scene", "content": _scene_segments},
    {"role": "user", "content": [{"type": "text", "text": user_message}]},
]
inputs = processor.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    sampling_rate=24000,
).to(model.device)
outputs = processor.batch_decode(model.generate(**inputs, max_new_tokens=10000, do_sample=False))
processor.save_audio(outputs, "output_multi_speaker_smart_voice.wav")
# --- Zero-shot voice cloning: a reference audio clip conditions the output voice. ---
_REFERENCE_VOICE_URL = "https://huggingface.co/datasets/eustlb/dummy-audio-samples-higgs/resolve/main/belinda.wav"

conversation = [
    {"role": "system", "content": [{"type": "text", "text": "Generate audio following instruction."}]},
    {"role": "scene", "content": [{"type": "text", "text": "Audio is recorded from a quiet room."}]},
    # In-context example: this user text presumably transcribes the reference
    # clip, and the assistant "answers" with the clip itself — TODO confirm
    # against the model card.
    {"role": "user", "content": [{"type": "text", "text": "Twas the night before my birthday. Hooray! It's almost here! It may not be a holiday, but it's the best day of the year."}]},
    {"role": "assistant", "content": [{"type": "audio", "url": _REFERENCE_VOICE_URL}]},
    # The actual text to synthesize in the cloned voice.
    {"role": "user", "content": [{"type": "text", "text": "The sun rises in the east and sets in the west. This simple fact has been observed by humans for thousands of years."}]},
]
inputs = processor.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    sampling_rate=24000,
).to(model.device)
outputs = processor.batch_decode(model.generate(**inputs, max_new_tokens=10000, do_sample=False))
processor.save_audio(outputs, "output_zero_shot_voice_cloning.wav")
# --- Multi-speaker voice cloning: each speaker tag is paired with a reference
# --- audio clip inside the scene turn.
conversation = [
    {"role": "system", "content": [{"type": "text", "text": "Generate audio following instruction."}]},
    {
        "role": "scene",
        "content": [
            {"type": "text", "text": "Audio is recorded from a quiet room."},
            # Tag label immediately followed by that speaker's reference audio.
            {"type": "text", "text": "SPEAKER0:"},
            {"type": "audio", "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav"},
            {"type": "text", "text": "SPEAKER1:"},
            {"type": "audio", "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/1272-128104-0000.flac"},
        ],
    },
    {"role": "user", "content": [{"type": "text", "text": "[SPEAKER0] I can't believe you did that without even asking me first!"}]},
]
inputs = processor.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    sampling_rate=24000,
).to(model.device)
outputs = processor.batch_decode(model.generate(**inputs, max_new_tokens=10000, do_sample=False))
processor.save_audio(outputs, "output_multi_speaker_voice_cloning.wav")
# --- Batched inference: two cloning conversations run through one generate call. ---
def _cloning_conversation(reference_transcript, reference_audio_url, target_text):
    # Standard zero-shot cloning layout: system + scene, a transcript/audio
    # in-context pair, then the text to synthesize in the cloned voice.
    return [
        {"role": "system", "content": [{"type": "text", "text": "Generate audio following instruction."}]},
        {"role": "scene", "content": [{"type": "text", "text": "Audio is recorded from a quiet room."}]},
        {"role": "user", "content": [{"type": "text", "text": reference_transcript}]},
        {"role": "assistant", "content": [{"type": "audio", "url": reference_audio_url}]},
        {"role": "user", "content": [{"type": "text", "text": target_text}]},
    ]

conversation1 = _cloning_conversation(
    "Twas the night before my birthday. Hooray! It's almost here! It may not be a holiday, but it's the best day of the year.",
    "https://huggingface.co/datasets/eustlb/dummy-audio-samples-higgs/resolve/main/belinda.wav",
    "The sun rises in the east and sets in the west. This simple fact has been observed by humans for thousands of years.",
)
conversation2 = _cloning_conversation(
    " It's super important to assess fairly the fact that our former model is over. And this is not a question of adjustment. This is not the same world, 2024, 2025. And on top of that, we are making the same mistakes, on top of the key elements I mentioned. We are over-regulating and under-investing. So just if, in the two to three years to come, if we follow our classical agenda, we will be out of the market. I have no doubts.",
    "https://huggingface.co/datasets/eustlb/dummy-audio-samples-higgs/resolve/main/macron.wav",
    "Hey, here is a clone from the given voice.",
)
inputs = processor.apply_chat_template(
    [conversation1, conversation2],
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    sampling_rate=24000,
).to(model.device)
outputs = processor.batch_decode(model.generate(**inputs, max_new_tokens=10000, do_sample=False))
# One output file per conversation in the batch.
processor.save_audio(outputs, ["output_batched_1.wav", "output_batched_2.wav"])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment