@amosgyamfi
Created November 10, 2025 13:00
import asyncio
import logging
from uuid import uuid4
from dotenv import load_dotenv
from vision_agents.core.edge.types import User
from vision_agents.core.agents import Agent
from vision_agents.plugins import getstream, openrouter, elevenlabs, deepgram, smart_turn, moondream
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s [call_id=%(call_id)s] %(name)s: %(message)s")
logger = logging.getLogger(__name__)
# Load environment variables from .env file
load_dotenv()
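# Note: beyond MOONDREAM_API_KEY (used by the Moondream processor below), the other
# plugins are assumed to read their credentials from the environment as well; typical
# variable names would be STREAM_API_KEY / STREAM_API_SECRET, OPENROUTER_API_KEY,
# ELEVENLABS_API_KEY, and DEEPGRAM_API_KEY, but check each plugin's documentation for
# the exact names it expects.
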
async def start_agent() -> None:
    # Create a Moondream CloudDetectionProcessor for object detection.
    # The API key is read from the MOONDREAM_API_KEY environment variable.
    moondream_processor = moondream.CloudDetectionProcessor(
        detect_objects=["person", "yoga mat", "water bottle", "chair", "table", "book", "phone", "laptop", "cup", "bag"],
        conf_threshold=0.3,
        fps=30,
    )

    # Create an agent that runs on Stream's edge with the Kimi K2 thinking model
    agent = Agent(
        edge=getstream.Edge(),  # low-latency edge; clients for React, iOS, Android, React Native, Flutter, etc.
        agent_user=User(name="Moondream Vision Assistant", id="agent"),  # the user object for the agent (name, image, etc.)
        instructions="Read @agent_instructions.md",
        processors=[moondream_processor],  # Moondream processor for real-time object detection with bounding boxes
        # Kimi K2 thinking model for advanced reasoning
        llm=openrouter.LLM(
            model="moonshotai/kimi-k2-thinking",
        ),
        # Use ElevenLabs for natural text-to-speech
        tts=elevenlabs.TTS(),
        # Deepgram for speech recognition
        stt=deepgram.STT(),
        # Smart turn detection for natural conversation
        turn_detection=smart_turn.TurnDetection(),
    )
logger.info("πŸ‘οΈ Starting Moondream Object Detection Agent with Kimi K2 Reasoning Model...")
await agent.create_user()
logger.info("βœ… Agent user created")
# Create a call
call = agent.edge.client.video.call("default", str(uuid4()))
logger.info("βœ… Call created")
# Open the demo UI
await agent.edge.open_demo(call)
logger.info("βœ… Demo UI opened")
# Have the agent join the call/room
with await agent.join(call):
await asyncio.sleep(3)
# Greet the user and explain vision capabilities
greeting = """Hello! I'm your vision assistant, enhanced with advanced reasoning capabilities.
I can see your camera feed in real-time and detect objects with green bounding boxes appearing around them.
I can identify people, yoga mats, water bottles, furniture, and various everyday objects.
What would you like me to help you find or monitor?"""
await agent.llm.simple_response(text=greeting)
logger.info("βœ… Agent ready and waiting for user interaction")
# Keep the agent running and wait for user interaction
# The agent will handle the conversation until the user closes the connection
try:
await agent.finish()
except Exception as e:
logger.info(f"Agent session ended: {e}")
# Allow time for graceful cleanup of WebSocket and media tracks
await asyncio.sleep(1)
if __name__ == "__main__":
asyncio.run(start_agent())
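
# To try this out (assuming the vision-agents package and the plugins imported above
# are installed, and the .env file is populated): run the script, e.g. `python main.py`
# (the filename is illustrative), wait for the demo UI that agent.edge.open_demo() opens,
# and join the call with your camera enabled. The green bounding boxes and the spoken
# greeting should confirm the agent is live.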