LiveKit Voice Assistant with Cartesia

import asyncio
import json

from livekit import rtc
from livekit.agents import JobContext, WorkerOptions, cli, JobProcess
from livekit.agents.llm import (
    ChatContext,
    ChatMessage,
)
from livekit.agents.voice_assistant import VoiceAssistant
from livekit.agents.log import logger
from livekit.plugins import deepgram, silero, cartesia, openai
from pydantic import BaseModel
from typing import Optional, List
from datetime import datetime
from dotenv import load_dotenv

load_dotenv()

# Describes a Cartesia voice as sent by clients in the "voice" participant
# attribute (not referenced directly below, but documents the payload shape).
class Voice(BaseModel):
    id: str
    user_id: Optional[str] = None
    is_public: bool
    name: str
    description: str
    created_at: datetime
    embedding: List[float]

def prewarm(proc: JobProcess):
    # Load the Silero VAD model once per worker process so each job starts fast.
    proc.userdata["vad"] = silero.VAD.load()

async def entrypoint(ctx: JobContext):
    initial_ctx = ChatContext(
        messages=[
            ChatMessage(
                role="system",
                content="You are a voice assistant created by LiveKit. Your interface with users will be voice. Pretend we're having a conversation, no special formatting or headings, just natural speech.",
            )
        ]
    )
    tts = cartesia.TTS(
        model="sonic",
        voice="248be419-c632-4f23-adf1-5324ed7dbf1d",
    )
    assistant = VoiceAssistant(
        vad=ctx.proc.userdata["vad"],
        stt=deepgram.STT(),
        llm=openai.LLM(model="gpt-4o-mini"),
        tts=tts,
        chat_ctx=initial_ctx,
    )

    # Track speaking state so a voice-change confirmation is only spoken
    # when neither side is mid-utterance.
    is_user_speaking = False
    is_agent_speaking = False
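
    # The handler below expects the "voice" attribute to hold JSON roughly
    # matching the Voice model above, e.g. (illustrative values only):
    #
    #   {"id": "...", "embedding": [0.02, -0.11, ...], "language": "fr"}
    #
    # Only "embedding" and the optional "language" key are actually read.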
    @ctx.room.on("participant_attributes_changed")
    def on_participant_attributes_changed(
        changed_attributes: dict[str, str], participant: rtc.Participant
    ):
        # ignore the agent's own attribute changes
        if participant == ctx.room.local_participant:
            return

        if "voice" in changed_attributes:
            voice = changed_attributes["voice"]
            logger.info(
                f"participant {participant.identity} requested voice change: {voice}"
            )
            voice_data = json.loads(voice)
            if "embedding" in voice_data:
                model = "sonic-english"
                language = "en"
                if "language" in voice_data and voice_data["language"] != "en":
                    language = voice_data["language"]
                    model = "sonic-multilingual"
                # Swap the voice by mutating the TTS plugin's private options in
                # place; the next synthesis request picks up the new settings.
                tts._opts.voice = voice_data["embedding"]
                tts._opts.model = model
                tts._opts.language = language
                if not (is_agent_speaking or is_user_speaking):
                    asyncio.create_task(
                        assistant.say("How do I sound now?", allow_interruptions=True)
                    )

    await ctx.connect()

    @assistant.on("agent_started_speaking")
    def agent_started_speaking():
        nonlocal is_agent_speaking
        is_agent_speaking = True

    @assistant.on("agent_stopped_speaking")
    def agent_stopped_speaking():
        nonlocal is_agent_speaking
        is_agent_speaking = False

    @assistant.on("user_started_speaking")
    def user_started_speaking():
        nonlocal is_user_speaking
        is_user_speaking = True

    @assistant.on("user_stopped_speaking")
    def user_stopped_speaking():
        nonlocal is_user_speaking
        is_user_speaking = False

    assistant.start(ctx.room)
    # brief pause so the room connection settles before the greeting
    await asyncio.sleep(1)
    await assistant.say("Hi there, how are you doing today?", allow_interruptions=True)

if __name__ == "__main__":
    cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint, prewarm_fnc=prewarm))
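
# How a client might trigger the voice-change handler above: set the "voice"
# participant attribute to the JSON payload. A minimal sketch, assuming the
# LiveKit Python SDK's LocalParticipant.set_attributes API (values are
# illustrative):
#
#   await room.local_participant.set_attributes(
#       {"voice": json.dumps({"embedding": embedding, "language": "en"})}
#   )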