@kwindla
Created April 23, 2025 22:48
OpenAI voice model detective story
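A short script that alternates two gpt-4o-mini-tts voices (a narrator on "ballad" and a character on "alloy") to perform a three-scene detective story, playing each line locally and saving the combined audio to output.wav.
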
import asyncio
import wave

import numpy as np

from openai import AsyncOpenAI
from openai.helpers import LocalAudioPlayer

openai = AsyncOpenAI()  # reads OPENAI_API_KEY from the environment

narrator_instructions = """
The speaker is narrating a detective story.
Voice Affect: Quietly dramatic.
Tone: Erudite. Learned. British accent.
Pacing: Slow and steady.
Emotion: Steady.
Pronunciation: Clear and precise.
Pauses: Brief pauses for narrative impact.
"""

def character_instructions(append):
    # Build per-line instructions for the character voice, varying only the tone.
    return f"""An American woman. A character in a detective story.
Speaking quickly. Trying not to be overheard.
Tone: {append}"""

# (text, instructions) pairs, consumed pairwise by the playback loop below.
narrator = [
    (
        "It was a dark and stormy night. The detective crept over to the window. A woman was talking to someone near the rosebeds.",
        narrator_instructions,
    ),
    (
        "Just then, the clock struck midnight. The woman turned, startled, and seemed to look right into the darkened room at the detective.",
        narrator_instructions,
    ),
    ("... she said, and then hurried around the corner.", narrator_instructions),
]

character = [
    ("shhh, we need to be quiet ...", character_instructions("Whispering.")),
    (
        "I don't think we have a chance of getting away with it.",
        character_instructions("Fearful."),
    ),
    ("Is there someone there?", character_instructions("Startled.")),
]

async def main() -> None:
    player = LocalAudioPlayer()
    all_segments: list[np.ndarray] = []  # audio buffers collected for WAV output

    # Iterate over character and narrator lines in lockstep; each scene plays
    # the character's dialogue first, then the narrator's line.
    for char_line, narr_line in zip(character, narrator):
        # Play the character line first.
        char_input, char_instructions = char_line
        async with openai.audio.speech.with_streaming_response.create(
            model="gpt-4o-mini-tts",
            voice="alloy",  # character voice
            input=char_input,
            instructions=char_instructions,
            response_format="pcm",
        ) as response:
            # Convert the streamed response to a numpy buffer once, so it can
            # be both played and saved. Note: _tts_response_to_buffer is a
            # private helper of LocalAudioPlayer and could change between
            # SDK releases.
            buffer = await player._tts_response_to_buffer(response)
            await player.play(buffer)
            all_segments.append(buffer)

        # Then play the narrator line.
        narr_input, narr_instructions = narr_line
        async with openai.audio.speech.with_streaming_response.create(
            model="gpt-4o-mini-tts",
            voice="ballad",  # narrator voice
            input=narr_input,
            instructions=narr_instructions,
            response_format="pcm",
        ) as response:
            buffer = await player._tts_response_to_buffer(response)
            await player.play(buffer)
            all_segments.append(buffer)

    # After all segments have played, save the concatenated output to a WAV file.
    if all_segments:
        combined = np.concatenate(all_segments, axis=0).flatten()
        # Clip before scaling so any out-of-range float samples can't wrap
        # around when cast to int16.
        int16_samples = (np.clip(combined, -1.0, 1.0) * 32767.0).astype(np.int16)
        with wave.open("output.wav", "wb") as wf:
            wf.setnchannels(1)  # mono
            wf.setsampwidth(2)  # 16-bit
            wf.setframerate(24000)  # match SAMPLE_RATE used by LocalAudioPlayer
            wf.writeframes(int16_samples.tobytes())


if __name__ == "__main__":
    asyncio.run(main())
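
To try this, the SDK's optional audio helpers (numpy and sounddevice) need to be installed and an API key set in the environment. The extra name and the script filename below are assumptions, not part of the original gist:

pip install "openai[voice_helpers]"
export OPENAI_API_KEY=...
python detective_story.py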