@amosgyamfi
Created November 10, 2025 13:00
import asyncio
import logging
from uuid import uuid4
from dotenv import load_dotenv
from vision_agents.core.edge.types import User
from vision_agents.core.agents import Agent
from vision_agents.plugins import getstream, openrouter, elevenlabs, deepgram, smart_turn, moondream
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s [call_id=%(call_id)s] %(name)s: %(message)s")
logger = logging.getLogger(__name__)
# Load environment variables from .env file
load_dotenv()
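# Note: beyond MOONDREAM_API_KEY (used by the Moondream processor below), the other
# plugins are assumed to read their credentials from the environment as well; typical
# variable names would be STREAM_API_KEY / STREAM_API_SECRET, OPENROUTER_API_KEY,
# ELEVENLABS_API_KEY, and DEEPGRAM_API_KEY, but check each plugin's documentation for
# the exact names it expects.
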
async def start_agent() -> None:
    # Create a Moondream CloudDetectionProcessor for object detection.
    # The API key is read from the MOONDREAM_API_KEY environment variable.
    moondream_processor = moondream.CloudDetectionProcessor(
        detect_objects=["person", "yoga mat", "water bottle", "chair", "table", "book", "phone", "laptop", "cup", "bag"],
        conf_threshold=0.3,
        fps=30,
    )

    # Create an agent that runs on Stream's edge with the Kimi K2 thinking model
    agent = Agent(
        edge=getstream.Edge(),  # low-latency edge; clients for React, iOS, Android, React Native, Flutter, etc.
        agent_user=User(name="Moondream Vision Assistant", id="agent"),  # the user object for the agent (name, image, etc.)
        instructions="Read @agent_instructions.md",
        processors=[moondream_processor],  # Moondream processor for real-time object detection with bounding boxes
        # Kimi K2 thinking model for advanced reasoning
        llm=openrouter.LLM(
            model="moonshotai/kimi-k2-thinking",
        ),
        # Use ElevenLabs for natural text-to-speech
        tts=elevenlabs.TTS(),
        # Deepgram for speech recognition
        stt=deepgram.STT(),
        # Smart turn detection for natural conversation
        turn_detection=smart_turn.TurnDetection(),
    )
logger.info("πŸ‘οΈ Starting Moondream Object Detection Agent with Kimi K2 Reasoning Model...")
await agent.create_user()
logger.info("βœ… Agent user created")
# Create a call
call = agent.edge.client.video.call("default", str(uuid4()))
logger.info("βœ… Call created")
# Open the demo UI
await agent.edge.open_demo(call)
logger.info("βœ… Demo UI opened")
# Have the agent join the call/room
with await agent.join(call):
await asyncio.sleep(3)
# Greet the user and explain vision capabilities
greeting = """Hello! I'm your vision assistant, enhanced with advanced reasoning capabilities.
I can see your camera feed in real-time and detect objects with green bounding boxes appearing around them.
I can identify people, yoga mats, water bottles, furniture, and various everyday objects.
What would you like me to help you find or monitor?"""
await agent.llm.simple_response(text=greeting)
logger.info("βœ… Agent ready and waiting for user interaction")
# Keep the agent running and wait for user interaction
# The agent will handle the conversation until the user closes the connection
try:
await agent.finish()
except Exception as e:
logger.info(f"Agent session ended: {e}")
# Allow time for graceful cleanup of WebSocket and media tracks
await asyncio.sleep(1)
if __name__ == "__main__":
asyncio.run(start_agent())
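
# To try this out (assuming the vision-agents package and the plugins imported above
# are installed, and the .env file is populated): run the script, e.g. `python main.py`
# (the filename is illustrative), wait for the demo UI that agent.edge.open_demo() opens,
# and join the call with your camera enabled. The green bounding boxes and the spoken
# greeting should confirm the agent is live.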