Willow Speech + Local LLM + HomeAssistant
### .textgen.env (text-generation-webui environment, used by the textgen service below) ###
# https://github.com/oobabooga/text-generation-webui/blob/main/README.md
# https://github.com/oobabooga/text-generation-webui/blob/main/docs/Spell-book.md
# By default the Dockerfile specifies these versions: 3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX
# You can find the compute capability for your card at https://developer.nvidia.com/cuda-gpus
# Tesla P100 = sm_60, sm_61, sm_62 and compute_60, compute_61, compute_62
#   e.g. TORCH_CUDA_ARCH_LIST=6.0;6.1;6.2;7.0;7.5;8.0;8.6+PTX
# RTX 3090 = sm_86 and compute_86 (PTX)
# +PTX is an intermediate representation that lets kernels JIT-compile at runtime for any CC >= the specified CC, at a performance penalty
TORCH_CUDA_ARCH_LIST='8.6;8.6+PTX'
# the model and extensions are now managed in textgen_settings.yaml
CLI_ARGS=--listen --listen-host 0.0.0.0 --rwkv-cuda-on --xformers --extensions api openai
# txt2image api
SD_WEBUI_URL=http://0.0.0.0:7861
# OpenAI api = http://0.0.0.0:5001/v1
OPENEDAI_PORT=5001
# https://github.com/oobabooga/text-generation-webui/tree/main/extensions/openai
LISTEN_HOST=0.0.0.0
HOST_PORT=7860
CONTAINER_PORT=7860 # main webui
# the port the blocking api endpoint binds to on the host
BLOCKING_PORT=5001 # openai api
HOST_BLOCKING_PORT=5001
# the port the api binds to on the host
HOST_API_PORT=7861 # textgen api
CONTAINER_API_PORT=7861
# the port the api stream endpoint binds to on the host
HOST_API_STREAM_PORT=5005 # textgen streaming api
HOST_STREAMING_PORT=5005
CONTAINER_API_STREAM_PORT=5005
STREAMING_PORT=5005
# the git ref to install text-generation-webui from
WEBUI_VERSION=HEAD
INSTALL_EXTENSIONS=true
# maybe -march=native?
CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_CUBLAS=on -DLLAMA_CLBLAST=on -DLLAMA_HIPBLAS=on -DLLAMA_F16C=on -DLLAMA_AVX512=on -DLLAMA_AVX2=on -DLLAMA_FMA=on"
GPU_FLAGS=all
GPUS=all
FORCE_CMAKE=1
FORCE_CUDA=1
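
To pick the right TORCH_CUDA_ARCH_LIST value for your own GPU, you can query the compute capability directly from PyTorch instead of looking it up by hand. A minimal sketch (assumes a CUDA-enabled torch install that can see the GPU; the script name is just for illustration):

# check_cuda_arch.py - print each visible GPU's compute capability as a
# TORCH_CUDA_ARCH_LIST-style string, e.g. "8.6" for an RTX 3090
import torch

if not torch.cuda.is_available():
    raise SystemExit("CUDA is not available - check drivers / container runtime")

for idx in range(torch.cuda.device_count()):
    major, minor = torch.cuda.get_device_capability(idx)
    name = torch.cuda.get_device_name(idx)
    print(f"{name}: {major}.{minor}  ->  TORCH_CUDA_ARCH_LIST='{major}.{minor};{major}.{minor}+PTX'")

An RTX 3090 prints 8.6, which matches the value hard-coded above.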
### .willow.env (Willow Inference Server environment) ###
# Which docker image to run
IMAGE=${IMAGE:-willow-inference-server}
# HTTPS listen port
LISTEN_PORT_HTTPS=${LISTEN_PORT_HTTPS:-19001}
# Listen port
LISTEN_PORT=${LISTEN_PORT:-19000}
# Media port range
# WebRTC dynamically negotiates UDP ports for each session
# You should keep this as small as possible for the expected number of WebRTC connections
MEDIA_PORT_RANGE=${MEDIA_PORT_RANGE:-10000-10050}
# Listen IP
LISTEN_IP=${LISTEN_IP:-0.0.0.0}
# GPUS - WIP for docker compose
GPUS="all"
# Allow forwarded IPs. This is a list of hosts to allow parsing of X-Forwarded headers from
# FORWARDED_ALLOW_IPS=
# allow all
FORWARDED_ALLOW_IPS="*"
# Shared memory size for docker
SHM_SIZE=1g
# Docker image tag
TAG=latest
NAME=wis
# CTranslate2 config options
CT2_VERBOSE=0
QUANT="float16"
container="docker"
# Log level - acceptable values are debug, info, warning, error, critical. Suggest info or debug.
LOG_LEVEL=${LOG_LEVEL:-warning}
### TBC if these work as envs ###
# The default whisper model to use. Options are "tiny", "base", "small", "medium", "large"
WHISPER_MODEL_DEFAULT="medium"
### custom_settings.py (Willow Inference Server settings overrides) ###
from functools import lru_cache
from typing import List

from pydantic import BaseSettings


class APISettings(BaseSettings):
    # # Project metadata
    # name: str = "Willow Inference Server"
    # description: str = "High Performance Language Inference API"
    # version: str = "1.0"

    # # Note: More beams is more accurate but slower.
    # # default beam_size - 5 is lib default, 1 for greedy
    # beam_size: int = 1
    # # default beam size for longer transcriptions
    # long_beam_size: int = 3
    # # Audio duration in ms to activate "long" mode. Any audio longer than this will use long_beam_size.
    # long_beam_size_threshold: int = 12000
    # model_threads: int = 10

    # # Default language
    # language: str = "en"
    # # Detect language by default?
    # detect_language: bool = False

    # # If False, load models only on first use.
    # # This saves GPU RAM but costs latency on the first calls.
    preload_all_models: bool = False
    # # Models to preload
    # # if preload_all_models is True, these are irrelevant
    # preload_whisper_model_tiny: bool = True
    # preload_whisper_model_base: bool = True
    # preload_whisper_model_small: bool = True
    preload_whisper_model_medium: bool = True
    # preload_whisper_model_large: bool = True
    preload_chatbot_model: bool = True  # only used if support_chatbot is True too
    preload_tts_model: bool = True  # only used if support_tts is True too

    # # TTS CUDA memory threshold - equivalent of 4GB GPUs
    # tts_memory_threshold: int = 3798205849
    # # SV CUDA memory threshold - equivalent of 6GB GPUs
    # sv_memory_threshold: int = 5798205849

    # # Enable chunking support
    # support_chunking: bool = True
    # # There is really no reason to disable chunking anymore,
    # # but if you still want to, you can set this threshold higher.
    # # The current value is the equivalent of 4GB GPUs.
    # chunking_memory_threshold: int = 3798205849
    # # Maximum number of chunks that are loaded into the GPU at once.
    # # This will need to be tweaked based on GPU RAM and the model used.
    # # 8GB GPUs should support at least 2 chunks, so starting with that.
    # concurrent_gpu_chunks: int = 2

    # # Enable TTS
    support_tts: bool = True
    # # Enable SV
    # support_sv: bool = False
    # # SV threshold
    # sv_threshold: float = 0.75

    # # The default whisper model to use. Options are "tiny", "base", "small", "medium", "large"
    whisper_model_default: str = 'medium'

    # # Default TTS format to use
    # tts_default_format: str = "FLAC"
    # # Default TTS speaker to use. CLB is US female
    # tts_default_speaker: str = "CLB"

    # # List of allowed origins for WebRTC. See https://fastapi.tiangolo.com/tutorial/cors/#use-corsmiddleware
    # cors_allowed_origins: List[str] = []

    # # If basic_auth_pass or basic_auth_user are set, all endpoints are guarded by basic auth.
    # # If basic_auth_user is falsy it will not be checked. If basic_auth_pass is falsy it will not be checked.
    # basic_auth_user: str = None
    # basic_auth_pass: str = None

    # # Support chatbot
    support_chatbot: bool = True
    # # Path to chatbot model - downloaded from HuggingFace at runtime by default (gets cached)
    chatbot_model_path: str = 'TheBloke/vicuna-13B-v1.5-GPTQ'
    # # Chatbot pipeline default temperature
    # chatbot_temperature: float = 0.7
    # # Chatbot pipeline default top_p
    # chatbot_top_p: float = 0.95
    # # Chatbot pipeline default repetition penalty
    # chatbot_repetition_penalty: float = 1.15
    # # Chatbot pipeline default max new tokens
    chatbot_max_new_tokens: int = 2048

    # # aiortc debug for connectivity and other WebRTC debugging
    # aiortc_debug: bool = False

    class Config:
        env_prefix = ""
        case_sensitive = False


@lru_cache()
def get_api_settings() -> APISettings:
    return APISettings()  # reads variables from environment
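
Because APISettings is a pydantic BaseSettings class with an empty env_prefix and case sensitivity disabled, any field above can be overridden with an environment variable (for example from .willow.env) without editing the file, and get_api_settings() is cached so the values are read once per process. A rough usage sketch; the override values are examples only:

import os

# environment overrides win over the defaults defined in APISettings
os.environ["WHISPER_MODEL_DEFAULT"] = "large"
os.environ["SUPPORT_TTS"] = "false"

from custom_settings import get_api_settings

settings = get_api_settings()          # first call constructs and caches APISettings
print(settings.whisper_model_default)  # -> "large"
print(settings.support_tts)            # -> False (pydantic parses the boolean string)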
### docker-compose.yml ###
---
### Willow / Textgen / Home Assistant Docker Compose ###
#
# WARNING: this is an EXAMPLE ONLY, and will not work as-is!
#
# Traefik is optional, but some people have asked how I use it.
# I have a separate docker-compose file for traefik v3 which is pretty specific to my setup.
# I've probably missed a few things, but hopefully you get the idea, and I'll do a proper blog post at some point.
#
###

### YAML Anchors ###
x-gpu: &gpu
  runtime: nvidia
  deploy:
    resources:
      reservations:
        devices:
          - driver: nvidia
            count: all
            capabilities: ["compute", "utility", "graphics"]

x-restart: &restart
  restart: unless-stopped

x-secopts: &secopts
  security_opt:
    - no-new-privileges:true

# memory-limit anchor used by wis-nginx below; 512m is an assumed value
x-limits-mem-512: &limits-mem-512
  mem_limit: 512m

x-ai-common: &ai-common
  <<: [*restart, *secopts]
  environment:
    - PUID=${PUID:-1001}
    - PGID=${PGID:-1001}

# Host paths referenced below - set these in your shell or a .env file so the ${...}
# interpolation works when docker compose parses this file:
#   wis_git: /home/username/git/wis
#   wis_config: /home/username/wis_config
#   textgen_models: /home/username/llm_models
#   textgen_config: /home/username/textgen_config
#   ha_config: /home/username/ha_config

volumes:
  cache:
  wis:
  textgen:

networks:
  willow:
    external: true # docker network create willow
    name: willow

####
services:
  &name willow: # WAS - willow application server
    container_name: *name
    env_file:
      - .willow.env
    hostname: *name
    <<: [*ai-common, *gpu]
    image: ghcr.io/toverainc/willow-application-server:latest
    shm_size: 1g
    ipc: host # https://docs.docker.com/engine/reference/run/#ipc-settings---ipc
    ulimits:
      memlock: -1
      stack: 67108864
    ports:
      - 8502 # optionally remove if using traefik
    volumes:
      - cache:/root/.cache
    networks:
      - willow
    # labels:
    #   traefik.enable: true
    #   traefik.http.routers.willow.rule: Host(`willow.your.domain`)
    #   traefik.http.routers.willow.entrypoints: websecure
    #   traefik.http.routers.willow.tls: true
    #   traefik.http.routers.willow.tls.certresolver: le
    #   traefik.http.routers.willow.service: willow-service
    #   traefik.http.services.willow-service.loadbalancer.server.port: 8502

  &name wis-nginx:
    container_name: *name
    env_file:
      - .willow.env
    hostname: *name
    <<: [*ai-common, *limits-mem-512]
    depends_on:
      - wis
    image: nginx:latest
    volumes:
      - ${wis_git}/nginx:/nginx
    environment:
      - NGINX_ENTRYPOINT_WORKER_PROCESSES_AUTOTUNE=1
    ports:
      - 19000 # optionally remove if using traefik
      - 19001 # optionally remove if using traefik
    networks:
      - willow
    # labels:
    #   traefik.enable: true
    #   traefik.http.routers.wis-nginx.rule: Host(`wis-nginx.your.domain`)
    #   traefik.http.routers.wis-nginx.tls.certresolver: le
    #   traefik.http.routers.wis-nginx.entrypoints: websecure
    #   traefik.http.routers.wis-nginx.tls.domains[0].main: "*.your.domain"
    #   traefik.http.routers.wis-nginx.service: wis-nginx-service
    #   traefik.http.services.wis-nginx-service.loadbalancer.server.port: 19001

  ### Willow Inference Server for TTS ###
  &name wis:
    container_name: *name
    env_file:
      - .willow.env
    hostname: *name
    <<: [*ai-common, *gpu]
    profiles:
      - *name
    image: willow-inference-server:latest
    shm_size: 1g
    ipc: host # https://docs.docker.com/engine/reference/run/#ipc-settings---ipc
    ulimits:
      memlock: -1
      stack: 67108864
    build:
      context: https://github.com/toverainc/willow-inference-server.git#main
      dockerfile: Dockerfile
    ports:
      - 0.0.0.0:10002-10050:10002-10050 # optionally remove if using traefik
      - 19000:19000 # optionally remove if using traefik
      - 19001:19001 # optionally remove if using traefik
    volumes:
      - cache:/root/.cache
      - ${wis_config}/custom_settings.py:/app/custom_settings.py:ro
      - ${wis_git}:/app
    command: ./entrypoint.sh
    networks:
      - willow
    # labels:
    #   # TODO: my traefik setup for the WIS is missing a few things that would allow the port mappings to be removed
    #   traefik.enable: true
    #   traefik.http.routers.wis.rule: Host(`wis.your.domain`)
    #   traefik.http.routers.wis.entrypoints: websecure
    #   traefik.http.routers.wis.tls: true
    #   traefik.http.routers.wis.tls.certresolver: le
    #   traefik.http.routers.wis.tls.domains[0].main: "*.your.domain"
    #   traefik.http.services.wis-service.loadbalancer.server.port: 19000
    #   # Middleware for buffering (Traefik doesn't support disabling buffering, but you can set limits)
    #   traefik.http.middlewares.wis-buffering.buffering.maxRequestBodyBytes: 104857600 # 100MB
    #   traefik.http.middlewares.wis-buffering.buffering.maxResponseBodyBytes: 104857600 # 100MB
    #   # Apply middlewares
    #   traefik.http.routers.wis.middlewares: wis-buffering #wis-headers

  &name textgen:
    container_name: *name
    env_file:
      - .textgen.env
    hostname: *name
    <<: [*ai-common, *gpu]
    build:
      context: https://github.com/oobabooga/text-generation-webui.git#main
      dockerfile: docker/Dockerfile
      tags:
        - docker.io/yourusername/textgen:latest #latest
      shm_size: "8gb"
      args:
        # Tesla P100 = sm_60, sm_61, sm_62 and compute_60, compute_61, compute_62
        # RTX 3090 = sm_86 and compute_86
        - "TORCH_CUDA_ARCH_LIST=8.6;8.6+PTX"
        - GPUS=all
        - TAG=docker.io/yourusername/textgen:latest #latest #cuda12
        - CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_CUBLAS=on -DLLAMA_CLBLAST=on -DLLAMA_HIPBLAS=on -DLLAMA_F16C=on -DLLAMA_AVX512=on -DLLAMA_AVX2=on -DLLAMA_FMA=on"
        - FORCE_CMAKE=1
        - WEBUI_VERSION=HEAD
        - FORCE_CUDA=1
    security_opt:
      - seccomp:unconfined
    stdin_open: true
    tty: true
    shm_size: "16gb"
    networks:
      - willow
    ports:
      - "7860:7860" # Gradio Web UI
      - "7861:7861" # Textgen API (blocking)
      - "5001:5001" # OpenAI compatible API (https://localai.your.domain/v1)
      - "5005:5005" # Textgen Websockets API (stream)
    volumes:
      - cache:/root/.cache:rw
      - ${textgen_models}:/app/models:cached
      - ${textgen_config}/textgen_settings.yaml:/app/settings.yaml
    # labels:
    #   traefik.enable: true
    #   traefik.http.routers.textgen.rule: Host(`textgen.your.domain`) || Host(`textgen`)
    #   traefik.http.routers.textgen.tls.certresolver: le
    #   traefik.http.routers.textgen.entrypoints: websecure
    #   traefik.http.routers.textgen.tls.domains[0].main: "*.your.domain"
    #   traefik.http.routers.textgen.service: textgen-service
    #   traefik.http.services.textgen-service.loadbalancer.server.port: 7860 # Web UI
    #   ### Stablediff
    #   traefik.http.routers.textgen-stablediff.rule: Host(`textgen.your.domain`) || Host(`textgen`) || Host(`textgen-stablediff.your.domain`)
    #   traefik.http.routers.textgen-stablediff.tls.certresolver: le
    #   traefik.http.routers.textgen-stablediff.entrypoints: stablediff # port 7861
    #   traefik.http.routers.textgen-stablediff.tls.domains[0].main: "*.your.domain"
    #   traefik.http.routers.textgen-stablediff.service: textgen-stablediff-service
    #   traefik.http.services.textgen-stablediff-service.loadbalancer.server.port: 7861 # Stable diffusion (and openai embedding?)
    #   ### Textgen API (blocking) 'http://textgen.your.domain:5000/api/v1/chat'
    #   traefik.http.routers.textgen-api-blocking.rule: Host(`textgen.your.domain`) || Host(`textgen`) || Host(`textgen-api-blocking.your.domain`)
    #   traefik.http.routers.textgen-api-blocking.tls.certresolver: le
    #   traefik.http.routers.textgen-api-blocking.entrypoints: textgenapi # port 5000
    #   traefik.http.routers.textgen-api-blocking.tls.domains[0].main: "*.your.domain"
    #   traefik.http.routers.textgen-api-blocking.service: textgen-api-blocking-service
    #   traefik.http.services.textgen-api-blocking-service.loadbalancer.server.port: 5000 # Textgen API (blocking)
    #   ### OpenAI compatible API 'https://openai.your.domain/v1'
    #   traefik.http.routers.openai.rule: Host(`openai.your.domain`) || Host(`openai`)
    #   traefik.http.routers.openai.tls.certresolver: le
    #   traefik.http.routers.openai.entrypoints: websecure # port 5001
    #   traefik.http.routers.openai.tls.domains[0].main: "*.your.domain"
    #   traefik.http.routers.openai.service: openai-service
    #   traefik.http.services.openai-service.loadbalancer.server.port: 5001 # OpenAI compatible API (https://openai.your.domain/v1)
    #   ### Textgen Websockets API (stream) 'ws://textgen.your.domain:5005/api/v1/chat-stream'
    #   traefik.http.routers.textgen-api-ws.rule: Host(`textgen.your.domain`) || Host(`textgen`) || Host(`textgen-api-ws.your.domain`)
    #   traefik.http.routers.textgen-api-ws.tls.certresolver: le
    #   traefik.http.routers.textgen-api-ws.entrypoints: websockets # port 5005
    #   traefik.http.routers.textgen-api-ws.tls.domains[0].main: "*.your.domain"
    #   traefik.http.routers.textgen-api-ws.service: textgen-api-ws-service
    #   # traefik.http.routers.textgen-api-ws-service.middlewares: websocketsSSL
    #   traefik.http.services.textgen-api-ws-service.loadbalancer.server.port: 5005 # Textgen API (stream)

  &name homeassistant:
    <<: [*restart, *secopts]
    container_name: *name
    image: "ghcr.io/home-assistant/home-assistant:stable"
    volumes:
      - ${ha_config}:/config
    environment:
      # note: DOCKER_MODS only takes effect on linuxserver.io images, not the official HA image
      - DOCKER_MODS=linuxserver/mods:homeassistant-hacs
    network_mode: host # optionally remove if using traefik
    # cap_add:
    #   - CAP_NET_RAW # optionally remove if using traefik
    #   - CAP_NET_BIND_SERVICE # optionally remove if using traefik
    # devices:
    #   - /dev/ttyACM0:/dev/ttyACM0
    #   - /dev/ttyUSB0:/dev/ttyUSB0
    #   - /dev/zigbee:/dev/zigbee
    #   - /dev/hidraw0:/dev/hidraw0
    #   - /dev/hidraw1:/dev/hidraw1
    #   - /dev/hidraw2:/dev/hidraw2
    #   - /dev/usb/hiddev0:/dev/usb/hiddev0
    #   - /dev/hci0:/dev/hci0
    #   - /sys/class/bluetooth/hci0:/sys/class/bluetooth/hci0
    # labels:
    #   traefik.enable: true
    #   traefik.http.routers.ha.rule: "Host(`ha.your.domain`) || Host(`homeassistant.your.domain`) || Host(`hass.your.domain`)"
    #   traefik.http.routers.ha.tls.certresolver: le
    #   traefik.http.routers.ha.entrypoints: websecure
    #   traefik.http.routers.ha.tls.domains[0].main: "*.your.domain"
    #   traefik.http.routers.ha.service: ha
    #   traefik.http.services.ha-service.loadbalancer.server.port: 8123
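
Once the stack is up (note that the wis service sits behind the wis profile, so it only starts when that profile is enabled, e.g. docker compose --profile wis up -d), a quick way to confirm things are running is to probe the published ports from the Docker host. The willow and wis-nginx services only publish container ports (Docker assigns ephemeral host ports), so this minimal sketch checks the explicitly mapped ones, assuming everything is on localhost with the default ports above:

# check_stack.py - TCP-probe the host ports published by the compose file above
import socket

SERVICES = {
    "wis (HTTP)": 19000,
    "wis (HTTPS)": 19001,
    "textgen web UI": 7860,
    "textgen blocking API": 7861,
    "textgen OpenAI-compatible API": 5001,
    "textgen streaming API": 5005,
    "home assistant (host network)": 8123,
}

for name, port in SERVICES.items():
    try:
        with socket.create_connection(("localhost", port), timeout=2):
            print(f"OK    {name} on :{port}")
    except OSError:
        print(f"DOWN  {name} on :{port}")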
### textgen_settings.yaml (text-generation-webui settings, mounted at /app/settings.yaml) ###
dark_theme: false
show_controls: true
start_with: ''
mode: chat
chat_style: cai-chat
preset: Divine Intellect
max_new_tokens: 2048
max_new_tokens_min: 1
max_new_tokens_max: 4096
seed: -1
negative_prompt: ''
truncation_length: 32768
truncation_length_min: 0
truncation_length_max: 16384
custom_stopping_strings: ''
auto_max_new_tokens: true
max_tokens_second: 0
ban_eos_token: false
custom_token_bans: ''
add_bos_token: false
skip_special_tokens: true
stream: true
name1: You
character: Assistant
instruction_template: Llama-v2
chat-instruct_command: ''
autoload_model: true
default_extensions:
  - api
  - openai
name2: Assistant
context: This is a conversation with your Assistant designed to help with various tasks such as answering questions, providing recommendations, improving code and helping with decision making. You can ask it anything you want and it will do its best to give you accurate and relevant information.
greeting: null
turn_template: null
stop_at_newline: false
chat_generation_attempts: 1
chat_generation_attempts_min: 1
chat_generation_attempts_max: 10
chat_default_extensions:
  - gallery
prompt: QA
code_syntax_highlight-activate: true
code_syntax_highlight-inline_highlight: true
code_syntax_highlight-copy_button: true
model: llava-v1.5-13b
multimodal_pipeline: llava-llama-2-13b
openai-port: 5001
openai-embedding_device: cuda
openai-sd_webui_url: http://127.0.0.1:7861
openai-debug: 1
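
With the api and openai extensions enabled above and port 5001 published by the textgen service, the loaded model can be queried with any OpenAI-style client. A minimal sketch using only the standard library; the model name and prompt are placeholders, and it assumes the extension's /v1/chat/completions route on localhost:

# ask_textgen.py - send a chat completion request to the OpenAI-compatible API on :5001
import json
import urllib.request

payload = {
    "model": "local-model",  # required by the API shape; the server answers with whatever model it has loaded
    "messages": [{"role": "user", "content": "Summarise what the Willow voice pipeline does."}],
    "max_tokens": 200,
}

req = urllib.request.Request(
    "http://localhost:5001/v1/chat/completions",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)

with urllib.request.urlopen(req, timeout=120) as resp:
    body = json.load(resp)

print(body["choices"][0]["message"]["content"])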