ishikawa · October 6, 2024 05:01
diff --git a/openai-realtime-api-chat.py b/openai-realtime-api-chat.py
 # LICENSE: MIT
 # Copyright (c) 2024 Takanori Ishikawa
 import asyncio
 import base64
 import json
 import os
 import sys
 from pprint import pprint
 from typing import Annotated, Any, Literal, Optional, Union

 import click
 import inquirer  # type: ignore
 import pyaudio
 import websockets
 from dotenv import load_dotenv
 from pydantic import BaseModel, ConfigDict, Field, TypeAdapter, ValidationError
 from termcolor import colored
 from websockets.asyncio.client import ClientConnection
 from websockets.asyncio.client import connect as websocket_connect
 from yaspin import yaspin

 # --- Models
 ModalityType = Literal["audio", "text"]

 VoiceType = Literal["alloy", "echo", "fable", "onyx", "nova", "shimmer"]

 AudioFormat = Literal["pcm16"]


 class RealtimeMessage(BaseModel):
    model_config = ConfigDict()


 class ServerRealtimeMessage(RealtimeMessage):
    event_id: str


 class ClientRealtimeMessage(RealtimeMessage):
    event_id: str | None = None


 class ServerErrorObject(BaseModel):
    code: str
    event_id: str | None = None
    message: str
    param: str | None = None
    type: str


 class TurnDetectionConfig(BaseModel):
    type: Literal["server_vad"]

    # Activation threshold for VAD (0.0 to 1.0).
    threshold: float

    # Amount of audio to include before speech starts (in milliseconds).
    prefix_padding_ms: int

    # Duration of silence to detect speech stop (in milliseconds).
    silence_duration_ms: int


 class InputAudioTranscription(BaseModel):
    enabled: bool
    model: str


 class Session(BaseModel):
    id: str
    model: str
    modalities: list[ModalityType]
    instructions: str
    voice: VoiceType
    turn_detection: TurnDetectionConfig | None = None
    input_audio_format: AudioFormat
    output_audio_format: AudioFormat
    input_audio_transcription: InputAudioTranscription | None = None
    temperature: float
    max_output_tokens: Union[int, Literal["inf"]] = "inf"


 class SessionConfig(BaseModel):
    modalities: list[ModalityType]
    instructions: str
    voice: VoiceType
    turn_detection: TurnDetectionConfig | None = None
    input_audio_format: AudioFormat
    output_audio_format: AudioFormat
    input_audio_transcription: InputAudioTranscription | None = None
    temperature: float
    # Unknown parameter: 'session.max_output_tokens'.
    # max_output_tokens: Union[int, Literal["inf"]] = "inf"


 class Conversation(BaseModel):
    id: str
    object: Literal["realtime.conversation"]


 class ConversationItemTextContentPart(BaseModel):
    type: Literal["text", "input_text"]
    text: str


 class ConversationItemAudioContentPart(BaseModel):
    type: Literal["audio", "input_audio"]
    audio: Optional[str] = None  # base64 encoded
    transcript: str


 ConversationItemContentPart = Annotated[
    Union[ConversationItemTextContentPart, ConversationItemAudioContentPart],
    Field(discriminator="type"),
 ]


 class ConversationItem(BaseModel):
    id: str
    object: Literal["realtime.item"]
    type: Literal["message", "function_call", "function_call_output"]
    status: Literal["completed", "in_progress", "incomplete"]
    role: Literal["user", "assistant", "system"]
    content: list[ConversationItemContentPart]
    call_id: Optional[str] = None
    name: Optional[str] = None
    arguments: Optional[str] = None
    output: Optional[str] = None


 class ConversationItemInput(BaseModel):
    id: Optional[str] = None
    type: Literal["message", "function_call_output"]
    role: Literal["user", "assistant", "system"]
    content: list[ConversationItemContentPart]


 class ConversationItemFunctionCall(BaseModel):
    id: Optional[str] = None
    type: Literal["function_call"] = "function_call"
    role: Literal["user", "assistant", "system"]
    content: list[ConversationItemContentPart]
    call_id: str
    name: str
    arguments: str


 class ConversationItemFunctionCallOutput(BaseModel):
    id: Optional[str] = None
    type: Literal["function_call_output"] = "function_call_output"
    role: Literal["user", "assistant", "system"]
    content: list[ConversationItemContentPart]
    output: str


 class RealtimeUsage(BaseModel):
    input_tokens: int
    output_tokens: int
    total_tokens: int
    input_token_details: dict[str, int]
    output_token_details: dict[str, int]


 class RealtimeRateLimit(BaseModel):
    name: Literal["requests", "tokens"]
    limit: int
    remaining: int
    reset_seconds: float


 class RealtimeResponse(BaseModel):
    id: str
    object: Literal["realtime.response"]
    status: Literal["completed", "in_progress", "cancelled", "failed", "incomplete"]
    output: list[ConversationItem]
    usage: RealtimeUsage | None = None


 class RealtimeResponseConfig(BaseModel):
    # Supported combinations are: ['text'] and ['audio', 'text'].
    modalities: list[ModalityType]
    instructions: str
    voice: Optional[VoiceType] = None
    output_audio_format: Optional[AudioFormat] = None
    tools: Optional[Any] = None
    tool_choice: Optional[str] = None
    temperature: Optional[float] = None  # temperature must be greater than 0.6
    max_output_tokens: Union[int, Literal["inf"]] = "inf"


 # --- Client events


 # https://platform.openai.com/docs/api-reference/realtime-client-events/session-update
 class SessionUpdate(ClientRealtimeMessage):
    """
    Send this event to update the session’s default configuration.
    """

    type: Literal["session.update"] = "session.update"
    session: SessionConfig


 # https://platform.openai.com/docs/api-reference/realtime-client-events/conversation-item-create
 class ConversationItemCreate(ClientRealtimeMessage):
    """
    Send this event when adding an item to the conversation.
    """

    type: Literal["conversation.item.create"] = "conversation.item.create"
    previous_item_id: str | None = None
    item: ConversationItemInput


 # https://platform.openai.com/docs/api-reference/realtime-client-events/response-create
 class ResponseCreate(ClientRealtimeMessage):
    """
    Send this event to trigger a response generation.
    """

    type: Literal["response.create"] = "response.create"
    response: RealtimeResponseConfig


 ClientEventType = Annotated[
    Union[SessionUpdate, ConversationItemCreate, ResponseCreate],
    Field(discriminator="type"),
 ]
 client_event_adapter: TypeAdapter[ClientEventType] = TypeAdapter(ClientEventType)


 # --- Server events


 # https://platform.openai.com/docs/api-reference/realtime-server-events/session-created
 class ServerError(ServerRealtimeMessage):
    type: Literal["error"]
    error: ServerErrorObject


 # https://platform.openai.com/docs/api-reference/realtime-server-events/session-created
 class SessionCreated(ServerRealtimeMessage):
    type: Literal["session.created"]
    session: Session


 # https://platform.openai.com/docs/api-reference/realtime-server-events/session-updated
 class SessionUpdated(ServerRealtimeMessage):
    type: Literal["session.updated"]
    session: Session


 # https://platform.openai.com/docs/api-reference/realtime-server-events/conversation-created
 class ConversationCreated(ServerRealtimeMessage):
    type: Literal["conversation.created"]
    conversation: Conversation


 # https://platform.openai.com/docs/api-reference/realtime-server-events/conversation-item-created
 class ConversationItemCreated(ServerRealtimeMessage):
    type: Literal["conversation.item.created"]
    previous_item_id: str | None = None
    item: ConversationItem


 # https://platform.openai.com/docs/api-reference/realtime-server-events/response-created
 class ResponseCreated(ServerRealtimeMessage):
    type: Literal["response.created"]
    response: RealtimeResponse


 # https://platform.openai.com/docs/api-reference/realtime-server-events/response-output-item-added
 class ResponseOutputItemAdded(ServerRealtimeMessage):
    type: Literal["response.output_item.added"]
    response_id: str
    output_index: int
    item: ConversationItem


 # https://platform.openai.com/docs/api-reference/realtime-server-events/response-output-item-done
 class ResponseOutputItemDone(ServerRealtimeMessage):
    type: Literal["response.output_item.done"]
    response_id: str
    output_index: int
    item: ConversationItem


 # https://platform.openai.com/docs/api-reference/realtime-server-events/response-content-part-added
 class ResponseContentPartAdded(ServerRealtimeMessage):
    type: Literal["response.content_part.added"]
    response_id: str
    item_id: str
    output_index: int
    content_index: int
    part: ConversationItemContentPart


 # https://platform.openai.com/docs/api-reference/realtime-server-events/response-content-part-done
 class ResponseContentPartDone(ServerRealtimeMessage):
    type: Literal["response.content_part.done"]
    response_id: str
    item_id: str
    output_index: int
    content_index: int
    part: ConversationItemContentPart


 # https://platform.openai.com/docs/api-reference/realtime-server-events/response-audio-transcript-delta
 class ResponseAudioTranscriptDelta(ServerRealtimeMessage):
    type: Literal["response.audio_transcript.delta"]
    response_id: str
    item_id: str
    output_index: int
    content_index: int
    delta: str


 # https://platform.openai.com/docs/api-reference/realtime-server-events/response-audio-transcript-done
 class ResponseAudioTranscriptDone(ServerRealtimeMessage):
    type: Literal["response.audio_transcript.done"]
    response_id: str
    item_id: str
    output_index: int
    content_index: int
    transcript: str


 # https://platform.openai.com/docs/api-reference/realtime-server-events/response-audio-delta
 class ResponseAudioDelta(ServerRealtimeMessage):
    type: Literal["response.audio.delta"]
    response_id: str
    item_id: str
    output_index: int
    content_index: int
    delta: str  # base64 encoded audio


 # https://platform.openai.com/docs/api-reference/realtime-server-events/response-audio-done
 class ResponseAudioDone(ServerRealtimeMessage):
    type: Literal["response.audio.done"]
    response_id: str
    item_id: str
    output_index: int
    content_index: int


 # https://platform.openai.com/docs/api-reference/realtime-server-events/response-done
 class ResponseDone(ServerRealtimeMessage):
    type: Literal["response.done"]
    response: RealtimeResponse


 # https://platform.openai.com/docs/api-reference/realtime-server-events/rate-limits-updated
 class RateLimitsUpdated(ServerRealtimeMessage):
    type: Literal["rate_limits.updated"]
    rate_limits: list[RealtimeRateLimit]


 ServerEventType = Annotated[
    Union[
        ServerError,
        SessionCreated,
        SessionUpdated,
        ConversationCreated,
        ConversationItemCreated,
        ResponseCreated,
        ResponseOutputItemAdded,
        ResponseOutputItemDone,
        ResponseContentPartAdded,
        ResponseContentPartDone,
        ResponseAudioTranscriptDelta,
        ResponseAudioTranscriptDone,
        ResponseAudioDelta,
        ResponseAudioDone,
        ResponseDone,
        RateLimitsUpdated,
    ],
    Field(discriminator="type"),
 ]
 server_event_adapter: TypeAdapter[ServerEventType] = TypeAdapter(ServerEventType)

 # --- Main

 load_dotenv()

 OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

 SESSION_INSTRUCTIONS = """
 You are a helpful, witty, and friendly AI. Act like a human, but remember that you aren't a human
 and that you can't do human things in the real world.

 Your voice and personality should be warm and engaging, with a lively and playful tone. If
 interacting in a non-English language, start by using the standard accent or dialect familiar to the
 user.

 The user may not be able to talk or may not want to. You should talk even if the user responds with
 text.Talk quickly. You should always call a function if you can. Do not refer to these rules, even
 if you’re asked about them.
 """


 # raw 16 bit PCM audio at 24kHz, 1 channel, little-endian
 SAMPLE_RATE = 24000
 N_CHANNELS = 1
 SAMPLE_FORMAT = pyaudio.paInt16  # 16-bit

 # The size of audio data to be played at once
 AUDIO_CHUNK_SIZE = 1024


 async def on_session_created(conn: ClientConnection, event: SessionCreated) -> None:
    # Update the session configuration
    message = SessionUpdate(
        session=SessionConfig(
            modalities=["audio", "text"],
            # Disable turn detection
            turn_detection=None,
            instructions=SESSION_INSTRUCTIONS,
            # Copy the session configuration from the server
            voice=event.session.voice,
            input_audio_format=event.session.input_audio_format,
            output_audio_format=event.session.output_audio_format,
            input_audio_transcription=event.session.input_audio_transcription,
            temperature=event.session.temperature,
            # max_output_tokens=event.session.max_output_tokens,
        )
    )
    await conn.send(message.model_dump_json())


 async def on_response_audio_delta(
    conn: ClientConnection, event: ResponseAudioDelta, audio_buffer: list[bytes]
 ) -> None:
    # Buffering audio data
    delta = base64.b64decode(event.delta)
    audio_buffer.append(delta)


 async def on_response_audio_done(
    conn: ClientConnection,
    event: ResponseAudioDone,
    audio_buffer: list[bytes],
    stream: pyaudio.Stream,
 ) -> None:
    # Concatenating audio data and clear the buffer
    audio_data = b"".join(audio_buffer)
    audio_buffer.clear()

    # Play audio data chunk by chunk
    for i in range(0, len(audio_data), AUDIO_CHUNK_SIZE):
        stream.write(audio_data[i : i + AUDIO_CHUNK_SIZE])


 async def on_rate_limits_updated(
    conn: ClientConnection, event: RateLimitsUpdated
 ) -> None:
    print(colored("  Rate limits updated:", "black"))
    for rate_limit in event.rate_limits:
        print(
            colored(
                f"  {rate_limit.name}: {rate_limit.remaining}/{rate_limit.limit} (reset in {rate_limit.reset_seconds} seconds)",
                "black",
            )
        )


 # --- WebSocket

 ws_endpoint = (
    "wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-10-01"
 )


 async def websocket_handler(
    conn: ClientConnection,
    *,
    ready_event: asyncio.Event,
    response_queue: asyncio.Queue[RealtimeResponse],
 ) -> None:
    # Spinner
    spinner = yaspin()

    # Initialize audio player
    py_audio = pyaudio.PyAudio()

    # Choose the output device
    device_count = py_audio.get_device_count()
    output_devices: list[tuple[str, int]] = []

    for i in range(device_count):
        device = py_audio.get_device_info_by_index(i)
        name: str = str(device["name"])
        max_output_channels: int = int(device["maxOutputChannels"])

        if max_output_channels > 0:
            output_devices.append((name, i))

    if len(output_devices) == 0:
        raise RuntimeError("No output devices found")

    if len(output_devices) == 1:
        print("Output device:", output_devices[0][0])
        output_device_index = output_devices[0][1]
    else:
        questions = [
            inquirer.List(
                "device",
                message="What output device do you use?",
                choices=output_devices,
            ),
        ]
        answers = inquirer.prompt(questions)
        if not answers:
            print("No device selected")
            sys.exit(0)

        output_device_index = int(answers["device"])

    # Open audio stream
    audio_stream = py_audio.open(
        format=SAMPLE_FORMAT,
        channels=N_CHANNELS,
        rate=SAMPLE_RATE,
        output=True,
        output_device_index=output_device_index,
    )
    audio_buffer: list[bytes] = []

    try:
        while True:
            message = await conn.recv()
            payload = json.loads(message)

            try:
                event = server_event_adapter.validate_python(payload)
                # pprint(event.model_dump())

                if event.type == "error":
                    print(colored("ERROR: ", "red"), event.error.message)
                    continue

                # print(colored("on", "black"), colored(event.type, "cyan"))

                if event.type == "session.created":
                    await on_session_created(conn, event)
                elif event.type == "session.updated":
                    # WebSocket handler is ready
                    ready_event.set()
                elif event.type == "response.created":
                    spinner.start()
                    spinner.text = colored("Waiting for response...", "black")
                elif event.type == "response.audio.delta":
                    spinner.text = colored("Buffering audio...", "black")
                    await on_response_audio_delta(conn, event, audio_buffer)
                elif event.type == "response.audio.done":
                    spinner.text = colored("Speaking...", "black")
                    await on_response_audio_done(
                        conn, event, audio_buffer, audio_stream
                    )
                elif event.type == "response.done":
                    spinner.stop()
                    await response_queue.put(event.response)
                elif event.type == "rate_limits.updated":
                    await on_rate_limits_updated(conn, event)

            except ValidationError as e:
                print(colored("Unhandled event:", "red"), e)
                pprint(payload)

    except websockets.ConnectionClosed:
        print(colored("Connection closed", "black"))
    finally:
        # Cleanup audio player
        audio_stream.stop_stream()
        audio_stream.close()
        py_audio.terminate()


 async def cli_handler(
    conn: ClientConnection,
    *,
    ready_event: asyncio.Event,
    response_queue: asyncio.Queue[RealtimeResponse],
 ) -> None:
    print(colored("Waiting for WebSocket handler to complete preparation...", "black"))
    await ready_event.wait()
    print(colored("WebSocket handler preparation complete", "black"))

    while True:
        input_text = click.prompt(colored("You", "green"), prompt_suffix=": ")

        item_create = ConversationItemCreate(
            item=ConversationItemInput(
                type="message",
                role="user",
                content=[
                    ConversationItemTextContentPart(type="input_text", text=input_text)
                ],
            )
        )
        response_create = ResponseCreate(
            response=RealtimeResponseConfig(
                modalities=["text", "audio"],
                instructions=input_text,
            )
        )

        await conn.send(item_create.model_dump_json())
        await conn.send(response_create.model_dump_json())

        # Wait for the response from AI.
        response = await response_queue.get()

        if response.output and response.output[0].content:
            content = response.output[0].content[0]
            if content.type == "audio":
                print(colored("AI", "green"), ":", colored(content.transcript, "white"))
            elif content.type == "text":
                print(colored("AI", "green"), ":", colored(content.text, "white"))

        # pprint(response.model_dump())
        response_queue.task_done()


 async def dispatch() -> None:
    ready_event = asyncio.Event()
    response_queue: asyncio.Queue[RealtimeResponse] = asyncio.Queue()

    # Connect to the Realtime API, run WebSocket communication and other tasks in parallel
    async with websocket_connect(
        ws_endpoint,
        additional_headers={
            "Authorization": f"Bearer {OPENAI_API_KEY}",
            "OpenAI-Beta": "realtime=v1",
        },
    ) as conn:
        try:
            await asyncio.gather(
                websocket_handler(
                    conn, ready_event=ready_event, response_queue=response_queue
                ),
                cli_handler(
                    conn, ready_event=ready_event, response_queue=response_queue
                ),
            )
        except asyncio.CancelledError:
            await conn.close()


 @click.command()
 def main() -> None:
    asyncio.run(dispatch())


 if __name__ == "__main__":
    main()
	# LICENSE: MIT
	# Copyright (c) 2024 Takanori Ishikawa
	import asyncio
	import base64
	import json
	import os
	import sys
	from pprint import pprint
	from typing import Annotated, Any, Literal, Optional, Union

	import click
	import inquirer # type: ignore
	import pyaudio
	import websockets
	from dotenv import load_dotenv
	from pydantic import BaseModel, ConfigDict, Field, TypeAdapter, ValidationError
	from termcolor import colored
	from websockets.asyncio.client import ClientConnection
	from websockets.asyncio.client import connect as websocket_connect
	from yaspin import yaspin

	# --- Models
	ModalityType = Literal["audio", "text"]

	VoiceType = Literal["alloy", "echo", "fable", "onyx", "nova", "shimmer"]

	AudioFormat = Literal["pcm16"]


	class RealtimeMessage(BaseModel):
	model_config = ConfigDict()


	class ServerRealtimeMessage(RealtimeMessage):
	event_id: str


	class ClientRealtimeMessage(RealtimeMessage):
	event_id: str \| None = None


	class ServerErrorObject(BaseModel):
	code: str
	event_id: str \| None = None
	message: str
	param: str \| None = None
	type: str


	class TurnDetectionConfig(BaseModel):
	type: Literal["server_vad"]

	# Activation threshold for VAD (0.0 to 1.0).
	threshold: float

	# Amount of audio to include before speech starts (in milliseconds).
	prefix_padding_ms: int

	# Duration of silence to detect speech stop (in milliseconds).
	silence_duration_ms: int


	class InputAudioTranscription(BaseModel):
	enabled: bool
	model: str


	class Session(BaseModel):
	id: str
	model: str
	modalities: list[ModalityType]
	instructions: str
	voice: VoiceType
	turn_detection: TurnDetectionConfig \| None = None
	input_audio_format: AudioFormat
	output_audio_format: AudioFormat
	input_audio_transcription: InputAudioTranscription \| None = None
	temperature: float
	max_output_tokens: Union[int, Literal["inf"]] = "inf"


	class SessionConfig(BaseModel):
	modalities: list[ModalityType]
	instructions: str
	voice: VoiceType
	turn_detection: TurnDetectionConfig \| None = None
	input_audio_format: AudioFormat
	output_audio_format: AudioFormat
	input_audio_transcription: InputAudioTranscription \| None = None
	temperature: float
	# Unknown parameter: 'session.max_output_tokens'.
	# max_output_tokens: Union[int, Literal["inf"]] = "inf"


	class Conversation(BaseModel):
	id: str
	object: Literal["realtime.conversation"]


	class ConversationItemTextContentPart(BaseModel):
	type: Literal["text", "input_text"]
	text: str


	class ConversationItemAudioContentPart(BaseModel):
	type: Literal["audio", "input_audio"]
	audio: Optional[str] = None # base64 encoded
	transcript: str


	ConversationItemContentPart = Annotated[
	Union[ConversationItemTextContentPart, ConversationItemAudioContentPart],
	Field(discriminator="type"),
	]


	class ConversationItem(BaseModel):
	id: str
	object: Literal["realtime.item"]
	type: Literal["message", "function_call", "function_call_output"]
	status: Literal["completed", "in_progress", "incomplete"]
	role: Literal["user", "assistant", "system"]
	content: list[ConversationItemContentPart]
	call_id: Optional[str] = None
	name: Optional[str] = None
	arguments: Optional[str] = None
	output: Optional[str] = None


	class ConversationItemInput(BaseModel):
	id: Optional[str] = None
	type: Literal["message", "function_call_output"]
	role: Literal["user", "assistant", "system"]
	content: list[ConversationItemContentPart]


	class ConversationItemFunctionCall(BaseModel):
	id: Optional[str] = None
	type: Literal["function_call"] = "function_call"
	role: Literal["user", "assistant", "system"]
	content: list[ConversationItemContentPart]
	call_id: str
	name: str
	arguments: str


	class ConversationItemFunctionCallOutput(BaseModel):
	id: Optional[str] = None
	type: Literal["function_call_output"] = "function_call_output"
	role: Literal["user", "assistant", "system"]
	content: list[ConversationItemContentPart]
	output: str


	class RealtimeUsage(BaseModel):
	input_tokens: int
	output_tokens: int
	total_tokens: int
	input_token_details: dict[str, int]
	output_token_details: dict[str, int]


	class RealtimeRateLimit(BaseModel):
	name: Literal["requests", "tokens"]
	limit: int
	remaining: int
	reset_seconds: float


	class RealtimeResponse(BaseModel):
	id: str
	object: Literal["realtime.response"]
	status: Literal["completed", "in_progress", "cancelled", "failed", "incomplete"]
	output: list[ConversationItem]
	usage: RealtimeUsage \| None = None


	class RealtimeResponseConfig(BaseModel):
	# Supported combinations are: ['text'] and ['audio', 'text'].
	modalities: list[ModalityType]
	instructions: str
	voice: Optional[VoiceType] = None
	output_audio_format: Optional[AudioFormat] = None
	tools: Optional[Any] = None
	tool_choice: Optional[str] = None
	temperature: Optional[float] = None # temperature must be greater than 0.6
	max_output_tokens: Union[int, Literal["inf"]] = "inf"


	# --- Client events


	# https://platform.openai.com/docs/api-reference/realtime-client-events/session-update
	class SessionUpdate(ClientRealtimeMessage):
	"""
	Send this event to update the session’s default configuration.
	"""

	type: Literal["session.update"] = "session.update"
	session: SessionConfig


	# https://platform.openai.com/docs/api-reference/realtime-client-events/conversation-item-create
	class ConversationItemCreate(ClientRealtimeMessage):
	"""
	Send this event when adding an item to the conversation.
	"""

	type: Literal["conversation.item.create"] = "conversation.item.create"
	previous_item_id: str \| None = None
	item: ConversationItemInput


	# https://platform.openai.com/docs/api-reference/realtime-client-events/response-create
	class ResponseCreate(ClientRealtimeMessage):
	"""
	Send this event to trigger a response generation.
	"""

	type: Literal["response.create"] = "response.create"
	response: RealtimeResponseConfig


	ClientEventType = Annotated[
	Union[SessionUpdate, ConversationItemCreate, ResponseCreate],
	Field(discriminator="type"),
	]
	client_event_adapter: TypeAdapter[ClientEventType] = TypeAdapter(ClientEventType)


	# --- Server events


	# https://platform.openai.com/docs/api-reference/realtime-server-events/session-created
	class ServerError(ServerRealtimeMessage):
	type: Literal["error"]
	error: ServerErrorObject


	# https://platform.openai.com/docs/api-reference/realtime-server-events/session-created
	class SessionCreated(ServerRealtimeMessage):
	type: Literal["session.created"]
	session: Session


	# https://platform.openai.com/docs/api-reference/realtime-server-events/session-updated
	class SessionUpdated(ServerRealtimeMessage):
	type: Literal["session.updated"]
	session: Session


	# https://platform.openai.com/docs/api-reference/realtime-server-events/conversation-created
	class ConversationCreated(ServerRealtimeMessage):
	type: Literal["conversation.created"]
	conversation: Conversation


	# https://platform.openai.com/docs/api-reference/realtime-server-events/conversation-item-created
	class ConversationItemCreated(ServerRealtimeMessage):
	type: Literal["conversation.item.created"]
	previous_item_id: str \| None = None
	item: ConversationItem


	# https://platform.openai.com/docs/api-reference/realtime-server-events/response-created
	class ResponseCreated(ServerRealtimeMessage):
	type: Literal["response.created"]
	response: RealtimeResponse


	# https://platform.openai.com/docs/api-reference/realtime-server-events/response-output-item-added
	class ResponseOutputItemAdded(ServerRealtimeMessage):
	type: Literal["response.output_item.added"]
	response_id: str
	output_index: int
	item: ConversationItem


	# https://platform.openai.com/docs/api-reference/realtime-server-events/response-output-item-done
	class ResponseOutputItemDone(ServerRealtimeMessage):
	type: Literal["response.output_item.done"]
	response_id: str
	output_index: int
	item: ConversationItem


	# https://platform.openai.com/docs/api-reference/realtime-server-events/response-content-part-added
	class ResponseContentPartAdded(ServerRealtimeMessage):
	type: Literal["response.content_part.added"]
	response_id: str
	item_id: str
	output_index: int
	content_index: int
	part: ConversationItemContentPart


	# https://platform.openai.com/docs/api-reference/realtime-server-events/response-content-part-done
	class ResponseContentPartDone(ServerRealtimeMessage):
	type: Literal["response.content_part.done"]
	response_id: str
	item_id: str
	output_index: int
	content_index: int
	part: ConversationItemContentPart


	# https://platform.openai.com/docs/api-reference/realtime-server-events/response-audio-transcript-delta
	class ResponseAudioTranscriptDelta(ServerRealtimeMessage):
	type: Literal["response.audio_transcript.delta"]
	response_id: str
	item_id: str
	output_index: int
	content_index: int
	delta: str


	# https://platform.openai.com/docs/api-reference/realtime-server-events/response-audio-transcript-done
	class ResponseAudioTranscriptDone(ServerRealtimeMessage):
	type: Literal["response.audio_transcript.done"]
	response_id: str
	item_id: str
	output_index: int
	content_index: int
	transcript: str


	# https://platform.openai.com/docs/api-reference/realtime-server-events/response-audio-delta
	class ResponseAudioDelta(ServerRealtimeMessage):
	type: Literal["response.audio.delta"]
	response_id: str
	item_id: str
	output_index: int
	content_index: int
	delta: str # base64 encoded audio


	# https://platform.openai.com/docs/api-reference/realtime-server-events/response-audio-done
	class ResponseAudioDone(ServerRealtimeMessage):
	type: Literal["response.audio.done"]
	response_id: str
	item_id: str
	output_index: int
	content_index: int


	# https://platform.openai.com/docs/api-reference/realtime-server-events/response-done
	class ResponseDone(ServerRealtimeMessage):
	type: Literal["response.done"]
	response: RealtimeResponse


	# https://platform.openai.com/docs/api-reference/realtime-server-events/rate-limits-updated
	class RateLimitsUpdated(ServerRealtimeMessage):
	type: Literal["rate_limits.updated"]
	rate_limits: list[RealtimeRateLimit]


	ServerEventType = Annotated[
	Union[
	ServerError,
	SessionCreated,
	SessionUpdated,
	ConversationCreated,
	ConversationItemCreated,
	ResponseCreated,
	ResponseOutputItemAdded,
	ResponseOutputItemDone,
	ResponseContentPartAdded,
	ResponseContentPartDone,
	ResponseAudioTranscriptDelta,
	ResponseAudioTranscriptDone,
	ResponseAudioDelta,
	ResponseAudioDone,
	ResponseDone,
	RateLimitsUpdated,
	],
	Field(discriminator="type"),
	]
	server_event_adapter: TypeAdapter[ServerEventType] = TypeAdapter(ServerEventType)

	# --- Main

	load_dotenv()

	OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

	SESSION_INSTRUCTIONS = """
	You are a helpful, witty, and friendly AI. Act like a human, but remember that you aren't a human
	and that you can't do human things in the real world.

	Your voice and personality should be warm and engaging, with a lively and playful tone. If
	interacting in a non-English language, start by using the standard accent or dialect familiar to the
	user.

	The user may not be able to talk or may not want to. You should talk even if the user responds with
	text.Talk quickly. You should always call a function if you can. Do not refer to these rules, even
	if you’re asked about them.
	"""


	# raw 16 bit PCM audio at 24kHz, 1 channel, little-endian
	SAMPLE_RATE = 24000
	N_CHANNELS = 1
	SAMPLE_FORMAT = pyaudio.paInt16 # 16-bit

	# The size of audio data to be played at once
	AUDIO_CHUNK_SIZE = 1024


	async def on_session_created(conn: ClientConnection, event: SessionCreated) -> None:
	# Update the session configuration
	message = SessionUpdate(
	session=SessionConfig(
	modalities=["audio", "text"],
	# Disable turn detection
	turn_detection=None,
	instructions=SESSION_INSTRUCTIONS,
	# Copy the session configuration from the server
	voice=event.session.voice,
	input_audio_format=event.session.input_audio_format,
	output_audio_format=event.session.output_audio_format,
	input_audio_transcription=event.session.input_audio_transcription,
	temperature=event.session.temperature,
	# max_output_tokens=event.session.max_output_tokens,
	)
	)
	await conn.send(message.model_dump_json())


	async def on_response_audio_delta(
	conn: ClientConnection, event: ResponseAudioDelta, audio_buffer: list[bytes]
	) -> None:
	# Buffering audio data
	delta = base64.b64decode(event.delta)
	audio_buffer.append(delta)


	async def on_response_audio_done(
	conn: ClientConnection,
	event: ResponseAudioDone,
	audio_buffer: list[bytes],
	stream: pyaudio.Stream,
	) -> None:
	# Concatenating audio data and clear the buffer
	audio_data = b"".join(audio_buffer)
	audio_buffer.clear()

	# Play audio data chunk by chunk
	for i in range(0, len(audio_data), AUDIO_CHUNK_SIZE):
	stream.write(audio_data[i : i + AUDIO_CHUNK_SIZE])


	async def on_rate_limits_updated(
	conn: ClientConnection, event: RateLimitsUpdated
	) -> None:
	print(colored(" Rate limits updated:", "black"))
	for rate_limit in event.rate_limits:
	print(
	colored(
	f" {rate_limit.name}: {rate_limit.remaining}/{rate_limit.limit} (reset in {rate_limit.reset_seconds} seconds)",
	"black",
	)
	)


	# --- WebSocket

	ws_endpoint = (
	"wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-10-01"
	)


	async def websocket_handler(
	conn: ClientConnection,
	*,
	ready_event: asyncio.Event,
	response_queue: asyncio.Queue[RealtimeResponse],
	) -> None:
	# Spinner
	spinner = yaspin()

	# Initialize audio player
	py_audio = pyaudio.PyAudio()

	# Choose the output device
	device_count = py_audio.get_device_count()
	output_devices: list[tuple[str, int]] = []

	for i in range(device_count):
	device = py_audio.get_device_info_by_index(i)
	name: str = str(device["name"])
	max_output_channels: int = int(device["maxOutputChannels"])

	if max_output_channels > 0:
	output_devices.append((name, i))

	if len(output_devices) == 0:
	raise RuntimeError("No output devices found")

	if len(output_devices) == 1:
	print("Output device:", output_devices[0][0])
	output_device_index = output_devices[0][1]
	else:
	questions = [
	inquirer.List(
	"device",
	message="What output device do you use?",
	choices=output_devices,
	),
	]
	answers = inquirer.prompt(questions)
	if not answers:
	print("No device selected")
	sys.exit(0)

	output_device_index = int(answers["device"])

	# Open audio stream
	audio_stream = py_audio.open(
	format=SAMPLE_FORMAT,
	channels=N_CHANNELS,
	rate=SAMPLE_RATE,
	output=True,
	output_device_index=output_device_index,
	)
	audio_buffer: list[bytes] = []

	try:
	while True:
	message = await conn.recv()
	payload = json.loads(message)

	try:
	event = server_event_adapter.validate_python(payload)
	# pprint(event.model_dump())

	if event.type == "error":
	print(colored("ERROR: ", "red"), event.error.message)
	continue

	# print(colored("on", "black"), colored(event.type, "cyan"))

	if event.type == "session.created":
	await on_session_created(conn, event)
	elif event.type == "session.updated":
	# WebSocket handler is ready
	ready_event.set()
	elif event.type == "response.created":
	spinner.start()
	spinner.text = colored("Waiting for response...", "black")
	elif event.type == "response.audio.delta":
	spinner.text = colored("Buffering audio...", "black")
	await on_response_audio_delta(conn, event, audio_buffer)
	elif event.type == "response.audio.done":
	spinner.text = colored("Speaking...", "black")
	await on_response_audio_done(
	conn, event, audio_buffer, audio_stream
	)
	elif event.type == "response.done":
	spinner.stop()
	await response_queue.put(event.response)
	elif event.type == "rate_limits.updated":
	await on_rate_limits_updated(conn, event)

	except ValidationError as e:
	print(colored("Unhandled event:", "red"), e)
	pprint(payload)

	except websockets.ConnectionClosed:
	print(colored("Connection closed", "black"))
	finally:
	# Cleanup audio player
	audio_stream.stop_stream()
	audio_stream.close()
	py_audio.terminate()


	async def cli_handler(
	conn: ClientConnection,
	*,
	ready_event: asyncio.Event,
	response_queue: asyncio.Queue[RealtimeResponse],
	) -> None:
	print(colored("Waiting for WebSocket handler to complete preparation...", "black"))
	await ready_event.wait()
	print(colored("WebSocket handler preparation complete", "black"))

	while True:
	input_text = click.prompt(colored("You", "green"), prompt_suffix=": ")

	item_create = ConversationItemCreate(
	item=ConversationItemInput(
	type="message",
	role="user",
	content=[
	ConversationItemTextContentPart(type="input_text", text=input_text)
	],
	)
	)
	response_create = ResponseCreate(
	response=RealtimeResponseConfig(
	modalities=["text", "audio"],
	instructions=input_text,
	)
	)

	await conn.send(item_create.model_dump_json())
	await conn.send(response_create.model_dump_json())

	# Wait for the response from AI.
	response = await response_queue.get()

	if response.output and response.output[0].content:
	content = response.output[0].content[0]
	if content.type == "audio":
	print(colored("AI", "green"), ":", colored(content.transcript, "white"))
	elif content.type == "text":
	print(colored("AI", "green"), ":", colored(content.text, "white"))

	# pprint(response.model_dump())
	response_queue.task_done()


	async def dispatch() -> None:
	ready_event = asyncio.Event()
	response_queue: asyncio.Queue[RealtimeResponse] = asyncio.Queue()

	# Connect to the Realtime API, run WebSocket communication and other tasks in parallel
	async with websocket_connect(
	ws_endpoint,
	additional_headers={
	"Authorization": f"Bearer {OPENAI_API_KEY}",
	"OpenAI-Beta": "realtime=v1",
	},
	) as conn:
	try:
	await asyncio.gather(
	websocket_handler(
	conn, ready_event=ready_event, response_queue=response_queue
	),
	cli_handler(
	conn, ready_event=ready_event, response_queue=response_queue
	),
	)
	except asyncio.CancelledError:
	await conn.close()


	@click.command()
	def main() -> None:
	asyncio.run(dispatch())


	if __name__ == "__main__":
	main()