ChristianKniep · October 30, 2025 10:44
diff --git a/gistfile1.txt b/gistfile1.txt
 services:
  dcgm:
    # DCGM exporter container; host port 9400 is mapped to container port 9400.
    image: nvcr.io/nvidia/k8s/dcgm-exporter:4.4.1-4.6.0-ubuntu22.04
    container_name: dcgm
    runtime: nvidia
    # NOTE(review): privileged grants the container full access to the host;
    # confirm that this level of access is actually required.
    privileged: true
    environment:
      NVIDIA_VISIBLE_DEVICES: "all"
      NVIDIA_DRIVER_CAPABILITIES: "compute,utility"
      CUDA_VISIBLE_DEVICES: "0"
      LOG_LEVEL: "debug"
    deploy:
      resources:
        reservations:
          # Reserve every NVIDIA GPU on the host for this service.
          devices:
            - driver: nvidia
              count: all
              capabilities:
                - gpu
    networks:
      - memmachine-network
    ports:
      - "9400:9400"
  node:
    # node-exporter with the default collectors switched off; only the cpu,
    # meminfo and loadavg collectors are enabled explicitly below.
    image: quay.io/prometheus/node-exporter:latest
    restart: unless-stopped
    command:
      - "--collector.disable-defaults"
      - "--collector.cpu"
      - "--collector.meminfo"
      - "--collector.loadavg"
    networks:
      - memmachine-network
    ports:
      - "9100:9100"
  prometheus:
    # Prometheus server. Time-series data is stored in the prometheus_data
    # named volume; the scrape configuration is bind-mounted from the host.
    # An untagged image resolves to :latest; spelled out here to match the
    # other services in this file.
    image: prom/prometheus:latest
    volumes:
      - prometheus_data:/prometheus
      # The container only reads its configuration, so mount it read-only
      # (same convention as the Grafana provisioning mount).
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
    ports:
      - "9090:9090"
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--storage.tsdb.path=/prometheus"
      # Keep 15 days of samples.
      - "--storage.tsdb.retention.time=15d"
    restart: always
    networks:
      - memmachine-network

  grafana:
    # Grafana UI, served on http://localhost:3000.
    image: grafana/grafana:latest
    container_name: grafana
    user: "472:0"  # runs as grafana user to avoid permissions issues
    ports:
      - "3000:3000"  # http://localhost:3000
    environment:
      # Admin credentials are taken from the shell environment or the .env
      # file when set; the fallbacks keep the previous admin/admin values.
      # Override them in production.
      GF_SECURITY_ADMIN_USER: "${GF_SECURITY_ADMIN_USER:-admin}"
      GF_SECURITY_ADMIN_PASSWORD: "${GF_SECURITY_ADMIN_PASSWORD:-admin}"
      GF_SERVER_ROOT_URL: "%(protocol)s://%(domain)s/"
      GF_USERS_ALLOW_SIGN_UP: "false"
      # Optional: preinstall plugins (comma-separated)
      # GF_INSTALL_PLUGINS: grafana-clock-panel,grafana-worldmap-panel
    healthcheck:
      # Healthy when the /api/health response contains "ok".
      test: ["CMD-SHELL", "wget -qO- http://localhost:3000/api/health | grep -q 'ok'"]
      interval: 10s
      timeout: 3s
      retries: 10
    volumes:
      - grafana-data:/var/lib/grafana
      # optional: drop datasources/dashboards here
      - ./grafana/provisioning:/etc/grafana/provisioning:ro
      # optional: custom config
      # - ./grafana/grafana.ini:/etc/grafana/grafana.ini:ro
    restart: unless-stopped
    networks:
      - memmachine-network

  ollama:
    # Ollama server with GPU access; host port 11434 maps to container port 11434.
    image: ollama/ollama:latest
    container_name: ollama
    runtime: nvidia
    environment:
      NVIDIA_VISIBLE_DEVICES: "all"
      NVIDIA_DRIVER_CAPABILITIES: "compute,utility"
      CUDA_VISIBLE_DEVICES: "0"
      LOG_LEVEL: "debug"
    deploy:
      resources:
        reservations:
          # Reserve every NVIDIA GPU on the host for this service.
          devices:
            - driver: nvidia
              count: all
              capabilities:
                - gpu
    volumes:
      - ollama:/root/.ollama
      # NOTE(review): nothing in this file points Ollama at /models; confirm
      # how this mount is consumed.
      - models:/models
    networks:
      - memmachine-network
    ports:
      - "11434:11434"
  litellm:
    # LiteLLM proxy, backed by the Postgres service "db" defined in this file.
    image: ghcr.io/berriai/litellm:main-stable
    # Start the proxy with the config.yaml file mounted from ./litellm.
    volumes:
      - ./litellm/config.yaml:/app/config.yaml
    command:
      - "--config=/app/config.yaml"
    ports:
      - "4000:4000"  # Map the container port to the host, change the host port if necessary
    environment:
      # Credentials here must match POSTGRES_USER / POSTGRES_PASSWORD / POSTGRES_DB on "db".
      DATABASE_URL: "postgresql://llmproxy:dbpassword9090@db:5432/litellm"
      STORE_MODEL_IN_DB: "True"  # allows adding models to proxy via UI
      # Placeholder keys; replace before exposing the proxy.
      LITELLM_MASTER_KEY: "sk-1234"
      LITELLM_SALT_KEY: "sk-4321"
    depends_on:
      # The short list form only orders container start-up. Wait for the
      # database healthcheck (pg_isready) to pass before starting the proxy.
      db:
        condition: service_healthy
    healthcheck:  # Defines the health check configuration for the container
      test: [ "CMD-SHELL", "wget --no-verbose --tries=1 http://localhost:4000/health/liveliness || exit 1" ]  # Command to execute for health check
      interval: 30s  # Perform health check every 30 seconds
      timeout: 10s   # Health check command times out after 10 seconds
      retries: 3     # Retry up to 3 times if health check fails
      start_period: 40s  # Wait 40 seconds after container start before beginning health checks
    networks:
      - memmachine-network

  db:
    # Postgres 16 instance used by the litellm service (reachable as "db"
    # on the shared network; no host port is published).
    image: postgres:16
    container_name: litellm_db
    restart: always
    environment:
      POSTGRES_USER: llmproxy
      POSTGRES_PASSWORD: dbpassword9090
      POSTGRES_DB: litellm
    # ports:
    #   - "5432:5432"
    volumes:
      # Persists Postgres data across container restarts
      - postgres_data:/var/lib/postgresql/data
    healthcheck:
      # Probe once per second, up to 10 times, until Postgres accepts connections.
      test:
        - "CMD-SHELL"
        - "pg_isready -d litellm -U llmproxy"
      interval: 1s
      timeout: 5s
      retries: 10
    networks:
      - memmachine-network
 # Network shared by every service in this file.
 networks:
  memmachine-network:
    # external: Compose does not create this network; it must already exist
    # (e.g. `docker network create memmachine-network`) before `up` is run.
    external: true
 # Named volumes with default settings, one per persistent mount used above.
 volumes:
  grafana-data: {}
  prometheus_data: {}
  postgres_data: {}
  ollama: {}
  models: {}
	services:
	dcgm:
	container_name: dcgm
	image: nvcr.io/nvidia/k8s/dcgm-exporter:4.4.1-4.6.0-ubuntu22.04
	runtime: nvidia
	privileged: true
	environment:
	- NVIDIA_VISIBLE_DEVICES=all
	- NVIDIA_DRIVER_CAPABILITIES=compute,utility
	- CUDA_VISIBLE_DEVICES=0
	- LOG_LEVEL=debug
	deploy:
	resources:
	reservations:
	devices:
	- driver: nvidia
	capabilities: [gpu]
	count: all
	ports:
	- "9400:9400"
	networks:
	- memmachine-network
	node:
	image: quay.io/prometheus/node-exporter:latest
	ports:
	- "9100:9100"
	command:
	- '--collector.disable-defaults'
	- '--collector.cpu'
	- '--collector.meminfo'
	- '--collector.loadavg'
	restart: unless-stopped
	networks:
	- memmachine-network
	prometheus:
	image: prom/prometheus
	volumes:
	- prometheus_data:/prometheus
	- ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
	ports:
	- "9090:9090"
	command:
	- "--config.file=/etc/prometheus/prometheus.yml"
	- "--storage.tsdb.path=/prometheus"
	- "--storage.tsdb.retention.time=15d"
	restart: always
	networks:
	- memmachine-network

	grafana:
	image: grafana/grafana:latest
	container_name: grafana
	user: "472:0" # runs as grafana user to avoid permissions issues
	ports:
	- "3000:3000" # http://localhost:3000
	environment:
	GF_SECURITY_ADMIN_USER: admin # change in production
	GF_SECURITY_ADMIN_PASSWORD: admin
	GF_SERVER_ROOT_URL: "%(protocol)s://%(domain)s/"
	GF_USERS_ALLOW_SIGN_UP: "false"
	# Optional: preinstall plugins (comma-separated)
	# GF_INSTALL_PLUGINS: grafana-clock-panel,grafana-worldmap-panel
	healthcheck:
	test: ["CMD-SHELL", "wget -qO- http://localhost:3000/api/health | grep -q 'ok'"]
	interval: 10s
	timeout: 3s
	retries: 10
	volumes:
	- grafana-data:/var/lib/grafana
	- ./grafana/provisioning:/etc/grafana/provisioning:ro # optional: drop datasources/dashboards here
	#- ./grafana/grafana.ini:/etc/grafana/grafana.ini:ro # optional: custom config
	restart: unless-stopped
	networks:
	- memmachine-network

	ollama:
	container_name: ollama
	image: ollama/ollama:latest
	runtime: nvidia
	environment:
	- NVIDIA_VISIBLE_DEVICES=all
	- NVIDIA_DRIVER_CAPABILITIES=compute,utility
	- CUDA_VISIBLE_DEVICES=0
	- LOG_LEVEL=debug
	deploy:
	resources:
	reservations:
	devices:
	- driver: nvidia
	capabilities: [gpu]
	count: all
	volumes:
	- ollama:/root/.ollama
	- models:/models
	ports:
	- "11434:11434"
	networks:
	- memmachine-network
	litellm:
	image: ghcr.io/berriai/litellm:main-stable
	#########################################
	## Uncomment these lines to start proxy with a config.yaml file ##
	volumes:
	- ./litellm/config.yaml:/app/config.yaml
	command:
	- "--config=/app/config.yaml"
	##############################################
	ports:
	- "4000:4000" # Map the container port to the host, change the host port if necessary
	environment:
	DATABASE_URL: "postgresql://llmproxy:dbpassword9090@db:5432/litellm"
	STORE_MODEL_IN_DB: "True" # allows adding models to proxy via UI
	LITELLM_MASTER_KEY: "sk-1234"
	LITELLM_SALT_KEY: "sk-4321"
	depends_on:
	- db # Indicates that this service depends on the 'db' service, ensuring 'db' starts first
	healthcheck: # Defines the health check configuration for the container
	test: [ "CMD-SHELL", "wget --no-verbose --tries=1 http://localhost:4000/health/liveliness || exit 1" ] # Command to execute for health check
	interval: 30s # Perform health check every 30 seconds
	timeout: 10s # Health check command times out after 10 seconds
	retries: 3 # Retry up to 3 times if health check fails
	start_period: 40s # Wait 40 seconds after container start before beginning health checks
	networks:
	- memmachine-network

	db:
	image: postgres:16
	restart: always
	container_name: litellm_db
	environment:
	POSTGRES_DB: litellm
	POSTGRES_USER: llmproxy
	POSTGRES_PASSWORD: dbpassword9090
	#ports:
	# - "5432:5432"
	volumes:
	- postgres_data:/var/lib/postgresql/data # Persists Postgres data across container restarts
	healthcheck:
	test: ["CMD-SHELL", "pg_isready -d litellm -U llmproxy"]
	interval: 1s
	timeout: 5s
	retries: 10
	networks:
	- memmachine-network
	networks:
	memmachine-network:
	external: true
	volumes:
	grafana-data:
	prometheus_data:
	postgres_data:
	ollama:
	models:
No results found