Created
April 23, 2025 22:48
-
-
Save kwindla/75c5772dd39ab0806329f53996015e2b to your computer and use it in GitHub Desktop.
OpenAI voice model detective story
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio | |
from openai import AsyncOpenAI | |
from openai.helpers import LocalAudioPlayer | |
import wave | |
import numpy as np | |
# Shared async OpenAI client used by main() for all TTS requests.
openai = AsyncOpenAI()

# TTS style prompt applied to every narrator line (spoken with the "ballad" voice).
narrator_instructions = """
The speaker is narrating a detective story.
Voice Affect: Quietly dramatic.
Tone: Erudite. Learned. British accent.
Pacing: Slow and steady.
Emotion: Steady.
Pronunciation: Clear and precise.\n\nPauses: Brief pauses for narrative impact.
"""
def character_instructions(append):
    """Build the TTS style prompt for the story's female character.

    *append* is the tone direction (e.g. "Whispering.") appended to the
    fixed character description.
    """
    base = (
        "An American woman. A character in a detective story.\n"
        "Speaking quickly. Trying not to be overheard.\n"
        "Tone: "
    )
    return base + append
# Narrator lines, each paired with the shared narrator style prompt.
_NARRATOR_TEXTS = [
    "It was a dark and stormy night. The detective crept over to the window. A woman was talking to someone near the rosebeds.",
    "Just then, the clock struck midnight. The woman turned, startled, and seemed to look right into the darkened room at the detective.",
    "... she said, and then hurried around the corner.",
]
narrator = [(text, narrator_instructions) for text in _NARRATOR_TEXTS]
# Character lines, each paired with a per-line tone direction.
_CHARACTER_LINES = [
    ("shhh, we need to be quiet ...", "Whispering."),
    ("I don't think we have a chance of getting away with it.", "Fearful."),
    ("Is there someone there?", "Startled."),
]
character = [(text, character_instructions(tone)) for text, tone in _CHARACTER_LINES]
def _save_wav(segments: list[np.ndarray], path: str = "output.wav") -> None:
    """Concatenate float PCM segments and write them as a 16-bit mono WAV.

    Does nothing when *segments* is empty. Samples are clipped to [-1, 1]
    before the int16 cast: without clipping, any float sample that
    overshoots full scale wraps around on conversion and produces a loud
    click in the output.
    """
    if not segments:
        return
    combined = np.concatenate(segments, axis=0).flatten()
    int16_samples = (np.clip(combined, -1.0, 1.0) * 32767.0).astype(np.int16)
    with wave.open(path, "wb") as wf:
        wf.setnchannels(1)  # mono
        wf.setsampwidth(2)  # 16-bit
        wf.setframerate(24000)  # match SAMPLE_RATE used by LocalAudioPlayer
        wf.writeframes(int16_samples.tobytes())


async def main() -> None:
    """Play the character/narrator dialogue aloud, then save it to output.wav."""
    player = LocalAudioPlayer()
    all_segments: list[np.ndarray] = []  # audio buffers, in playback order

    async def speak(voice: str, text: str, instructions: str) -> None:
        """Synthesize one line, play it, and record its buffer for the WAV."""
        async with openai.audio.speech.with_streaming_response.create(
            model="gpt-4o-mini-tts",
            voice=voice,
            input=text,
            instructions=instructions,
            response_format="pcm",
        ) as response:
            # NOTE(review): _tts_response_to_buffer is a private helper of
            # LocalAudioPlayer — confirm it survives openai SDK upgrades.
            buffer = await player._tts_response_to_buffer(response)
            await player.play(buffer)
            all_segments.append(buffer)

    # Alternate lines: the character speaks first, then the narrator responds.
    for (char_input, char_instructions), (narr_input, narr_instructions_) in zip(
        character, narrator
    ):
        await speak("alloy", char_input, char_instructions)  # character voice
        await speak("ballad", narr_input, narr_instructions_)  # narrator voice

    _save_wav(all_segments)
# Script entry point: drive the async playback loop to completion.
if __name__ == "__main__":
    asyncio.run(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment