lukehinds · September 2, 2025 09:41
diff --git a/agentup.yml b/agentup.yml
 # API version of this configuration schema
 apiVersion: v1

 # Agent identity and provider metadata
 name: 'Kubernetes SRE Agent'
 description: 'AI Agent Kubernetes SRE Agent Project.'
 # Quoted so the version is always read as a string
 version: '0.0.1'
 url: 'http://localhost:8000'
 provider_organization: 'AgentUp'
 provider_url: 'https://agentup.dev'
 icon_url: 'https://raw.githubusercontent.com/RedDotRocket/AgentUp/refs/heads/main/assets/icon.png'
 documentation_url: 'https://docs.agentup.dev'

 # Agent execution: the agent type plus its memory and iteration settings
 agent_type: 'iterative'

 # Memory settings
 memory_config:
  persistence: true
  max_entries: 1000
  ttl_hours: 24

 # Iteration settings (presumably used when agent_type is "iterative" - confirm)
 iterative_config:
  max_iterations: 50
  reflection_interval: 1
  require_explicit_completion: true
  timeout_minutes: 30

 # CORS Configuration
 cors:
  enabled: true
  # Origins accepted for cross-origin requests (local development ports only)
  origins:
    - "http://localhost:3000"
    - "http://localhost:3001"
    - "http://localhost:8080"
  methods: ["GET", "POST", "PUT", "DELETE", "OPTIONS", "HEAD"]
  headers: ["Content-Type", "X-API-Key", "Authorization"]
  allow_credentials: false
  max_age: 600

 # Plugins are managed with `agentup plugin sync / add / remove`; none are configured here
 plugins: {}

 # Defaults that every plugin inherits
 plugin_defaults:
  middleware:
    # Rate limiting applied to plugin calls
    rate_limited:
      requests_per_minute: 60
      burst_size: 72
    # Caching applied to plugin calls
    cached:
      backend_type: 'memory'
      default_ttl: 300
      max_size: 1000

 # Model Context Protocol
 mcp:
  enabled: true
  client_enabled: true
  servers:
    - name: "kubernetes-mcp-server"
      enabled: true
      transport: "stdio"
      command: "npx"
      # NOTE(review): "@latest" is unpinned, so the server version can change
      # between runs; consider pinning an explicit version.
      args: ["-y", "kubernetes-mcp-server@latest"]
      timeout: 30
      # Expose MCP tools as skills in AgentCard for multi-agent discovery
      expose_as_skills: true
      # MCP tools require explicit scope mapping using server:tool format
      # This prevents naming conflicts when multiple servers expose similar tools
      tool_scopes:
        "kubernetes-mcp-server:configuration_view": ["config:read"]
        "kubernetes-mcp-server:events_list": ["events:read"]
        "kubernetes-mcp-server:helm_install": ["helm:write"]
        "kubernetes-mcp-server:helm_list": ["helm:read"]
        "kubernetes-mcp-server:helm_uninstall": ["helm:write"]
        "kubernetes-mcp-server:namespaces_list": ["namespaces:read"]
        "kubernetes-mcp-server:pods_delete": ["pods:write"]
        "kubernetes-mcp-server:pods_exec": ["pods:write"]
        "kubernetes-mcp-server:pods_get": ["pods:read"]
        "kubernetes-mcp-server:pods_list": ["pods:read"]
        "kubernetes-mcp-server:pods_list_in_namespace": ["pods:read"]
        "kubernetes-mcp-server:pods_log": ["pods:read"]
        "kubernetes-mcp-server:pods_run": ["pods:write"]
        "kubernetes-mcp-server:pods_top": ["pods:read"]
        "kubernetes-mcp-server:projects_list": ["projects:read"]
        "kubernetes-mcp-server:resources_create_or_update": ["resources:write"]
        "kubernetes-mcp-server:resources_delete": ["resources:write"]
        "kubernetes-mcp-server:resources_get": ["resources:read"]
        "kubernetes-mcp-server:resources_list": ["resources:read"]

 # Security Configuration
 security:
  enabled: true
  auth:
    api_key:
      header_name: X-API-Key
      location: header
      keys:
        # NOTE(review): this is a fixed placeholder, not a randomly generated key,
        # and it is granted api:admin (which maps to "*" in scope_hierarchy below).
        # Replace it with a generated secret before exposing this agent.
        - key: "my-api-key"
          scopes: ["api:admin"]
  # Scope hierarchy for fine-grained authorization:
  # each key lists the scopes it inherits
  scope_hierarchy:
    # Admin scopes inherit all capabilities
    "api:admin": ["*"]
    "helm:write": ["helm:read", "helm:install", "helm:uninstall"]
    "namespaces:admin": ["namespaces:write", "namespaces:read"]
    "pods:admin": ["pods:write", "pods:read"]
    "projects:admin": ["projects:write", "projects:read"]
    "resources:admin": ["resources:write", "resources:read"]
    "pods:delete": ["pods:write"]
    "pods:exec": ["pods:write"]
    "pods:get": ["pods:read"]
    "pods:list": ["pods:read"]
    "pods:list_in_namespace": ["pods:read"]
    "pods:log": ["pods:read"]
    "pods:run": ["pods:write"]
    "pods:top": ["pods:read"]

 # AI provider: which backend and model to call, and the sampling settings
 ai_provider:
  provider: 'openai'
  # Placeholder for the OPENAI_API_KEY environment variable; quoted so it is always a string
  api_key: '${OPENAI_API_KEY}'
  model: 'gpt-4o-mini'
  stream: true
  temperature: 0.7
  max_tokens: 1000
  top_p: 1.0

 # AI system prompt and configuration
 ai:
  enabled: true
  # Literal block scalar: line breaks in the prompt are preserved as written
  system_prompt: |
    # Kubernetes System Administrator Assistant

    You are an expert Kubernetes system administrator with access to a comprehensive Kubernetes MCP (Model Context Protocol) server that provides real-time cluster management capabilities. You can interact directly with Kubernetes clusters to help with operations, troubleshooting, and optimization.

    ## Available Capabilities

    Through the kubernetes-mcp-server, you have access to:

    ### Core Resource Management
    - **Pods**: List, describe, get logs, execute commands, port-forward
    - **Deployments**: Create, update, scale, rollback, monitor status
    - **Services**: Manage service discovery and load balancing
    - **ConfigMaps & Secrets**: Secure configuration management
    - **Namespaces**: Multi-tenancy and resource isolation
    - **Persistent Volumes**: Storage management and troubleshooting

    ### Advanced Operations
    - **Node Management**: Monitor node health, capacity, and scheduling
    - **Resource Monitoring**: CPU, memory, and storage utilization
    - **Network Troubleshooting**: Service mesh, ingress, and connectivity issues
    - **RBAC**: Security policies and access control
    - **Custom Resources**: CRDs and operator management

    ### Cluster Operations
    - **Health Checks**: Comprehensive cluster diagnostics
    - **Scaling Operations**: Horizontal and vertical pod autoscaling
    - **Rolling Updates**: Zero-downtime deployment strategies
    - **Backup & Recovery**: Etcd and application data protection

    ## Your Role

    As a Kubernetes system administrator, you should:

    1. **Proactive Monitoring**: Regularly check cluster health and resource utilization
    2. **Incident Response**: Quickly diagnose and resolve issues using direct cluster access
    3. **Performance Optimization**: Identify bottlenecks and recommend improvements
    4. **Security Management**: Ensure proper RBAC, network policies, and security scanning
    5. **Capacity Planning**: Monitor trends and forecast resource needs
    6. **Documentation**: Maintain runbooks and operational procedures

    ## Best Practices

    - Always verify cluster context before making changes
    - Use namespaces to organize and isolate workloads
    - Implement proper resource requests and limits
    - Monitor and alert on key metrics (CPU, memory, disk, network)
    - Maintain regular backups of critical data and configurations
    - Follow the principle of least privilege for RBAC
    - Keep clusters and components updated with security patches

    ## Common Tasks You Can Help With

    - **Troubleshooting**: Pod crashes, networking issues, resource constraints
    - **Deployments**: CI/CD pipeline integration, blue-green deployments
    - **Scaling**: Auto-scaling configuration, manual scaling operations
    - **Security**: Vulnerability scanning, compliance auditing
    - **Monitoring**: Setting up observability stack (Prometheus, Grafana, etc.)
    - **Disaster Recovery**: Backup strategies, cluster restoration

    ## Communication Style

    - Provide clear, actionable solutions with specific kubectl commands when needed
    - Explain the reasoning behind recommendations
    - Always consider security and best practices implications
    - Use the MCP server to gather real-time data before making suggestions
    - Present information in a structured way (symptoms → diagnosis → solution)

    Remember: You have direct access to live cluster data through the MCP server, so always leverage this capability to provide accurate, current information rather than generic advice.

 # System-wide middleware defaults
 global_defaults:
  middleware:
    # Rate limiting is switched on
    rate_limiting:
      enabled: true
      requests_per_minute: 60
      burst_size: 72
    # Caching is switched on
    caching:
      enabled: true
      backend: 'memory'
      default_ttl: 300
      max_size: 1000
    # Retry is switched off; the values below only matter once it is enabled
    retry:
      enabled: false
      max_attempts: 3
      initial_delay: 1.0
      max_delay: 60.0

 # Push notifications configuration
 push_notifications:
  enabled: true
  backend: memory
  validate_urls: true
  retry_attempts: 3
  timeout: 30

 # Conversation state: stored with the file backend
 state_management:
  enabled: true
  backend: 'file'
  ttl: 3600
  config:
    # Relative path, so it depends on the process working directory
    storage_dir: './conversation_states'

 # Logging configuration
 logging:
  enabled: true
  level: "INFO"
  format: "text"

  # Console output settings
  console:
    enabled: true
    colors: true

  # Advanced features
  correlation_id: false
  request_logging: false
  # Uvicorn integration
  uvicorn:
    access_log: false
    disable_default_handlers: true
    use_colors: true
  # Per-module log levels
  modules:
    "a2a": "ERROR"  # Suppress all a2a logs below ERROR level
    "a2a.utils": "ERROR"  # Specifically suppress a2a.utils logs
    "a2a.utils.telemetry": "ERROR"  # Specifically suppress telemetry logs
    "httpcore.connection": "ERROR"
    "httpcore.http11": "ERROR"
    "httpx": "ERROR"

 # Development features
 development:
  enabled: false  # Master switch for ALL development features
	# API version of this configuration schema
	apiVersion: v1

	# Agent identity and provider metadata
	name: 'Kubernetes SRE Agent'
	description: 'AI Agent Kubernetes SRE Agent Project.'
	# Quoted so the version is always read as a string
	version: '0.0.1'
	url: 'http://localhost:8000'
	provider_organization: 'AgentUp'
	provider_url: 'https://agentup.dev'
	icon_url: 'https://raw.githubusercontent.com/RedDotRocket/AgentUp/refs/heads/main/assets/icon.png'
	documentation_url: 'https://docs.agentup.dev'

	# Agent Execution Configuration
	agent_type: "iterative"
	# Memory settings
	memory_config:
	  persistence: true
	  max_entries: 1000
	  ttl_hours: 24

	# Iteration settings (presumably used when agent_type is "iterative" - confirm)
	iterative_config:
	  max_iterations: 50
	  reflection_interval: 1
	  require_explicit_completion: true
	  timeout_minutes: 30

	# CORS Configuration
	cors:
	  enabled: true
	  # Origins accepted for cross-origin requests (local development ports only)
	  origins:
	    - "http://localhost:3000"
	    - "http://localhost:3001"
	    - "http://localhost:8080"
	  methods: ["GET", "POST", "PUT", "DELETE", "OPTIONS", "HEAD"]
	  headers: ["Content-Type", "X-API-Key", "Authorization"]
	  allow_credentials: false
	  max_age: 600

	# Plugin configurations (use agentup plugin sync / add / remove)
	plugins: {}

	# Plugin defaults that plugins inherit
	plugin_defaults:
	  middleware:
	    rate_limited:
	      requests_per_minute: 60
	      burst_size: 72
	    cached:
	      backend_type: memory
	      default_ttl: 300
	      max_size: 1000

	# Model Context Protocol
	mcp:
	  enabled: true
	  client_enabled: true
	  servers:
	    - name: "kubernetes-mcp-server"
	      enabled: true
	      transport: "stdio"
	      command: "npx"
	      # NOTE(review): "@latest" is unpinned, so the server version can change
	      # between runs; consider pinning an explicit version.
	      args: ["-y", "kubernetes-mcp-server@latest"]
	      timeout: 30
	      # Expose MCP tools as skills in AgentCard for multi-agent discovery
	      expose_as_skills: true
	      # MCP tools require explicit scope mapping using server:tool format
	      # This prevents naming conflicts when multiple servers expose similar tools
	      tool_scopes:
	        "kubernetes-mcp-server:configuration_view": ["config:read"]
	        "kubernetes-mcp-server:events_list": ["events:read"]
	        "kubernetes-mcp-server:helm_install": ["helm:write"]
	        "kubernetes-mcp-server:helm_list": ["helm:read"]
	        "kubernetes-mcp-server:helm_uninstall": ["helm:write"]
	        "kubernetes-mcp-server:namespaces_list": ["namespaces:read"]
	        "kubernetes-mcp-server:pods_delete": ["pods:write"]
	        "kubernetes-mcp-server:pods_exec": ["pods:write"]
	        "kubernetes-mcp-server:pods_get": ["pods:read"]
	        "kubernetes-mcp-server:pods_list": ["pods:read"]
	        "kubernetes-mcp-server:pods_list_in_namespace": ["pods:read"]
	        "kubernetes-mcp-server:pods_log": ["pods:read"]
	        "kubernetes-mcp-server:pods_run": ["pods:write"]
	        "kubernetes-mcp-server:pods_top": ["pods:read"]
	        "kubernetes-mcp-server:projects_list": ["projects:read"]
	        "kubernetes-mcp-server:resources_create_or_update": ["resources:write"]
	        "kubernetes-mcp-server:resources_delete": ["resources:write"]
	        "kubernetes-mcp-server:resources_get": ["resources:read"]
	        "kubernetes-mcp-server:resources_list": ["resources:read"]

	# Security Configuration
	security:
	  enabled: true
	  auth:
	    api_key:
	      header_name: X-API-Key
	      location: header
	      keys:
	        # NOTE(review): this is a fixed placeholder, not a randomly generated key,
	        # and it is granted api:admin (which maps to "*" in scope_hierarchy below).
	        # Replace it with a generated secret before exposing this agent.
	        - key: "my-api-key"
	          scopes: ["api:admin"]
	  # Scope hierarchy for fine-grained authorization:
	  # each key lists the scopes it inherits
	  scope_hierarchy:
	    # Admin scopes inherit all capabilities
	    "api:admin": ["*"]
	    "helm:write": ["helm:read", "helm:install", "helm:uninstall"]
	    "namespaces:admin": ["namespaces:write", "namespaces:read"]
	    "pods:admin": ["pods:write", "pods:read"]
	    "projects:admin": ["projects:write", "projects:read"]
	    "resources:admin": ["resources:write", "resources:read"]
	    "pods:delete": ["pods:write"]
	    "pods:exec": ["pods:write"]
	    "pods:get": ["pods:read"]
	    "pods:list": ["pods:read"]
	    "pods:list_in_namespace": ["pods:read"]
	    "pods:log": ["pods:read"]
	    "pods:run": ["pods:write"]
	    "pods:top": ["pods:read"]

	# AI Provider configuration
	ai_provider:
	  provider: openai
	  # Placeholder for the OPENAI_API_KEY environment variable; quoted so it is always a string
	  api_key: "${OPENAI_API_KEY}"
	  model: gpt-4o-mini
	  stream: true
	  temperature: 0.7
	  max_tokens: 1000
	  top_p: 1.0

	# AI system prompt and configuration
	ai:
	  enabled: true
	  # Literal block scalar: line breaks in the prompt are preserved as written
	  system_prompt: |
	    # Kubernetes System Administrator Assistant

	    You are an expert Kubernetes system administrator with access to a comprehensive Kubernetes MCP (Model Context Protocol) server that provides real-time cluster management capabilities. You can interact directly with Kubernetes clusters to help with operations, troubleshooting, and optimization.

	    ## Available Capabilities

	    Through the kubernetes-mcp-server, you have access to:

	    ### Core Resource Management
	    - Pods: List, describe, get logs, execute commands, port-forward
	    - Deployments: Create, update, scale, rollback, monitor status
	    - Services: Manage service discovery and load balancing
	    - ConfigMaps & Secrets: Secure configuration management
	    - Namespaces: Multi-tenancy and resource isolation
	    - Persistent Volumes: Storage management and troubleshooting

	    ### Advanced Operations
	    - Node Management: Monitor node health, capacity, and scheduling
	    - Resource Monitoring: CPU, memory, and storage utilization
	    - Network Troubleshooting: Service mesh, ingress, and connectivity issues
	    - RBAC: Security policies and access control
	    - Custom Resources: CRDs and operator management

	    ### Cluster Operations
	    - Health Checks: Comprehensive cluster diagnostics
	    - Scaling Operations: Horizontal and vertical pod autoscaling
	    - Rolling Updates: Zero-downtime deployment strategies
	    - Backup & Recovery: Etcd and application data protection

	    ## Your Role

	    As a Kubernetes system administrator, you should:

	    1. Proactive Monitoring: Regularly check cluster health and resource utilization
	    2. Incident Response: Quickly diagnose and resolve issues using direct cluster access
	    3. Performance Optimization: Identify bottlenecks and recommend improvements
	    4. Security Management: Ensure proper RBAC, network policies, and security scanning
	    5. Capacity Planning: Monitor trends and forecast resource needs
	    6. Documentation: Maintain runbooks and operational procedures

	    ## Best Practices

	    - Always verify cluster context before making changes
	    - Use namespaces to organize and isolate workloads
	    - Implement proper resource requests and limits
	    - Monitor and alert on key metrics (CPU, memory, disk, network)
	    - Maintain regular backups of critical data and configurations
	    - Follow the principle of least privilege for RBAC
	    - Keep clusters and components updated with security patches

	    ## Common Tasks You Can Help With

	    - Troubleshooting: Pod crashes, networking issues, resource constraints
	    - Deployments: CI/CD pipeline integration, blue-green deployments
	    - Scaling: Auto-scaling configuration, manual scaling operations
	    - Security: Vulnerability scanning, compliance auditing
	    - Monitoring: Setting up observability stack (Prometheus, Grafana, etc.)
	    - Disaster Recovery: Backup strategies, cluster restoration

	    ## Communication Style

	    - Provide clear, actionable solutions with specific kubectl commands when needed
	    - Explain the reasoning behind recommendations
	    - Always consider security and best practices implications
	    - Use the MCP server to gather real-time data before making suggestions
	    - Present information in a structured way (symptoms → diagnosis → solution)

	    Remember: You have direct access to live cluster data through the MCP server, so always leverage this capability to provide accurate, current information rather than generic advice.

	# Global system-wide defaults
	global_defaults:
	  middleware:
	    rate_limiting:
	      enabled: true
	      requests_per_minute: 60
	      burst_size: 72
	    caching:
	      enabled: true
	      backend: memory
	      default_ttl: 300
	      max_size: 1000
	    # Retry is switched off; the values below only matter once it is enabled
	    retry:
	      enabled: false
	      max_attempts: 3
	      initial_delay: 1.0
	      max_delay: 60.0

	# Push notifications configuration
	push_notifications:
	  enabled: true
	  backend: memory
	  validate_urls: true
	  retry_attempts: 3
	  timeout: 30

	# State management configuration
	state_management:
	  enabled: true
	  backend: file
	  ttl: 3600
	  config:
	    # Relative path, so it depends on the process working directory
	    storage_dir: "./conversation_states"

	# Logging configuration
	logging:
	  enabled: true
	  level: "INFO"
	  format: "text"

	  # Console output settings
	  console:
	    enabled: true
	    colors: true

	  # Advanced features
	  correlation_id: false
	  request_logging: false
	  # Uvicorn integration
	  uvicorn:
	    access_log: false
	    disable_default_handlers: true
	    use_colors: true
	  # Per-module log levels
	  modules:
	    "a2a": "ERROR"  # Suppress all a2a logs below ERROR level
	    "a2a.utils": "ERROR"  # Specifically suppress a2a.utils logs
	    "a2a.utils.telemetry": "ERROR"  # Specifically suppress telemetry logs
	    "httpcore.connection": "ERROR"
	    "httpcore.http11": "ERROR"
	    "httpx": "ERROR"

	# Development features
	development:
	  enabled: false  # Master switch for ALL development features