LiveKit Voice Assistant with Cartesia

import asyncio
import json

from livekit import rtc
from livekit.agents import JobContext, WorkerOptions, cli, JobProcess
from livekit.agents.llm import (
    ChatContext,
    ChatMessage,
)
from livekit.agents.voice_assistant import VoiceAssistant
from livekit.agents.log import logger
from livekit.plugins import deepgram, silero, cartesia, openai
from pydantic import BaseModel
from typing import Optional, List
from datetime import datetime
from dotenv import load_dotenv

load_dotenv()

# Describes a Cartesia voice as sent by clients in the "voice" participant
# attribute (not referenced directly below, but documents the payload shape).
class Voice(BaseModel):
    id: str
    user_id: Optional[str] = None
    is_public: bool
    name: str
    description: str
    created_at: datetime
    embedding: List[float]

def prewarm(proc: JobProcess):
    # Load the Silero VAD model once per worker process so each job starts fast.
    proc.userdata["vad"] = silero.VAD.load()

async def entrypoint(ctx: JobContext):
    initial_ctx = ChatContext(
        messages=[
            ChatMessage(
                role="system",
                content="You are a voice assistant created by LiveKit. Your interface with users will be voice. Pretend we're having a conversation, no special formatting or headings, just natural speech.",
            )
        ]
    )
    tts = cartesia.TTS(
        model="sonic",
        voice="248be419-c632-4f23-adf1-5324ed7dbf1d",
    )
    assistant = VoiceAssistant(
        vad=ctx.proc.userdata["vad"],
        stt=deepgram.STT(),
        llm=openai.LLM(model="gpt-4o-mini"),
        tts=tts,
        chat_ctx=initial_ctx,
    )

    # Track speaking state so a voice-change confirmation is only spoken
    # when neither side is mid-utterance.
    is_user_speaking = False
    is_agent_speaking = False
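
    # The handler below expects the "voice" attribute to hold JSON roughly
    # matching the Voice model above, e.g. (illustrative values only):
    #
    #   {"id": "...", "embedding": [0.02, -0.11, ...], "language": "fr"}
    #
    # Only "embedding" and the optional "language" key are actually read.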
    @ctx.room.on("participant_attributes_changed")
    def on_participant_attributes_changed(
        changed_attributes: dict[str, str], participant: rtc.Participant
    ):
        # ignore the agent's own attribute changes
        if participant == ctx.room.local_participant:
            return

        if "voice" in changed_attributes:
            voice = changed_attributes["voice"]
            logger.info(
                f"participant {participant.identity} requested voice change: {voice}"
            )
            voice_data = json.loads(voice)
            if "embedding" in voice_data:
                model = "sonic-english"
                language = "en"
                if "language" in voice_data and voice_data["language"] != "en":
                    language = voice_data["language"]
                    model = "sonic-multilingual"
                # Swap the voice by mutating the TTS plugin's private options in
                # place; the next synthesis request picks up the new settings.
                tts._opts.voice = voice_data["embedding"]
                tts._opts.model = model
                tts._opts.language = language
                if not (is_agent_speaking or is_user_speaking):
                    asyncio.create_task(
                        assistant.say("How do I sound now?", allow_interruptions=True)
                    )

    await ctx.connect()

    @assistant.on("agent_started_speaking")
    def agent_started_speaking():
        nonlocal is_agent_speaking
        is_agent_speaking = True

    @assistant.on("agent_stopped_speaking")
    def agent_stopped_speaking():
        nonlocal is_agent_speaking
        is_agent_speaking = False

    @assistant.on("user_started_speaking")
    def user_started_speaking():
        nonlocal is_user_speaking
        is_user_speaking = True

    @assistant.on("user_stopped_speaking")
    def user_stopped_speaking():
        nonlocal is_user_speaking
        is_user_speaking = False

    assistant.start(ctx.room)
    # brief pause so the room connection settles before the greeting
    await asyncio.sleep(1)
    await assistant.say("Hi there, how are you doing today?", allow_interruptions=True)

if __name__ == "__main__":
    cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint, prewarm_fnc=prewarm))
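
# How a client might trigger the voice-change handler above: set the "voice"
# participant attribute to the JSON payload. A minimal sketch, assuming the
# LiveKit Python SDK's LocalParticipant.set_attributes API (values are
# illustrative):
#
#   await room.local_participant.set_attributes(
#       {"voice": json.dumps({"embedding": embedding, "language": "en"})}
#   )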