import asyncio
import logging
from uuid import uuid4

from dotenv import load_dotenv

from vision_agents.core.edge.types import User
from vision_agents.core.agents import Agent
from vision_agents.plugins import getstream, openrouter, elevenlabs, deepgram, smart_turn, moondream

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s [call_id=%(call_id)s] %(name)s: %(message)s")
logger = logging.getLogger(__name__)

# Load environment variables from a .env file
load_dotenv()


async def start_agent() -> None:
    # Create a Moondream CloudDetectionProcessor for object detection.
    # The API key is read from the MOONDREAM_API_KEY environment variable.
    moondream_processor = moondream.CloudDetectionProcessor(
        detect_objects=["person", "yoga mat", "water bottle", "chair", "table", "book", "phone", "laptop", "cup", "bag"],
        conf_threshold=0.3,
        fps=30,
    )
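    # Tuning notes (my reading of these parameters, not verified against the
    # plugin docs): conf_threshold filters out low-confidence detections, and
    # fps caps how many video frames per second are sent to the Moondream
    # cloud API, so lowering it should cut API usage on long calls.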

    # Create an agent that runs on Stream's edge network with the Kimi K2 thinking model
    agent = Agent(
        edge=getstream.Edge(),  # low-latency edge; clients for React, iOS, Android, React Native, Flutter, etc.
        agent_user=User(name="Moondream Vision Assistant", id="agent"),  # the agent's user object (name, image, etc.)
        instructions="Read @agent_instructions.md",
        processors=[moondream_processor],  # real-time object detection with bounding boxes
        # Kimi K2 thinking model for advanced reasoning
        llm=openrouter.LLM(
            model="moonshotai/kimi-k2-thinking",
        ),
        # ElevenLabs for natural text-to-speech
        tts=elevenlabs.TTS(),
        # Deepgram for speech recognition
        stt=deepgram.STT(),
        # Smart turn detection for natural conversation
        turn_detection=smart_turn.TurnDetection(),
    )
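
    # In broad strokes, each conversational turn flows through these components:
    # Deepgram transcribes the user's speech, smart_turn decides when the user
    # has finished speaking, the Kimi K2 model (via OpenRouter) reasons over the
    # transcript alongside Moondream's detections, and ElevenLabs speaks the reply.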
| logger.info("ποΈ Starting Moondream Object Detection Agent with Kimi K2 Reasoning Model...") | |
| await agent.create_user() | |
| logger.info("β Agent user created") | |
| # Create a call | |
| call = agent.edge.client.video.call("default", str(uuid4())) | |
| logger.info("β Call created") | |
| # Open the demo UI | |
| await agent.edge.open_demo(call) | |
| logger.info("β Demo UI opened") | |
| # Have the agent join the call/room | |
| with await agent.join(call): | |
| await asyncio.sleep(3) | |

        # Greet the user and explain the vision capabilities
        greeting = """Hello! I'm your vision assistant, enhanced with advanced reasoning capabilities.
I can see your camera feed in real time and detect objects, drawing green bounding boxes around them.
I can identify people, yoga mats, water bottles, furniture, and various everyday objects.
What would you like me to help you find or monitor?"""
        await agent.llm.simple_response(text=greeting)
        logger.info("✅ Agent ready and waiting for user interaction")

        # Keep the agent running; it handles the conversation
        # until the user closes the connection.
        try:
            await agent.finish()
        except Exception as e:
            logger.info(f"Agent session ended: {e}")

    # Allow time for graceful cleanup of WebSocket connections and media tracks
    await asyncio.sleep(1)


if __name__ == "__main__":
    asyncio.run(start_agent())
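
To run this yourself (a sketch, not verified against the Vision Agents repo): save the script next to the agent_instructions.md file it references, install the imported plugins, and run it with a recent Python. MOONDREAM_API_KEY is confirmed by the comment in the script; the remaining plugins presumably read their own credentials from the environment as well, so load_dotenv() would pick up a .env along these lines (variable names other than MOONDREAM_API_KEY are my assumptions; confirm against each plugin's docs):

# .env (hypothetical variable names except MOONDREAM_API_KEY)
MOONDREAM_API_KEY=...      # confirmed above: read by moondream.CloudDetectionProcessor
STREAM_API_KEY=...         # assumed: getstream.Edge()
STREAM_API_SECRET=...      # assumed
OPENROUTER_API_KEY=...     # assumed: openrouter.LLM()
ELEVENLABS_API_KEY=...     # assumed: elevenlabs.TTS()
DEEPGRAM_API_KEY=...       # assumed: deepgram.STT()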