Willow Speech + Local LLM + HomeAssistant
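
### text-generation-webui environment (.textgen.env - referenced by the compose file below) ###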
# https://github.com/oobabooga/text-generation-webui/blob/main/README.md
# https://github.com/oobabooga/text-generation-webui/blob/main/docs/Spell-book.md
# By default the Dockerfile specifies these versions: 3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX
# You can find the compute capability for your card at https://developer.nvidia.com/cuda-gpus
# Tesla P100 = sm_60, sm_61, sm_62 and compute_60, compute_61, compute_62
#   e.g. TORCH_CUDA_ARCH_LIST=6.0;6.1;6.2;7.0;7.5;8.0;8.6+PTX
# RTX 3090 = sm_86 and compute_86
# +PTX is an intermediate representation that lets kernels runtime-compile for any CC >= the specified CC, at a performance penalty
TORCH_CUDA_ARCH_LIST='8.6;8.6+PTX'
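# To check a card's compute capability locally (assuming PyTorch with CUDA is installed):
#   python3 -c "import torch; print(torch.cuda.get_device_capability())"  # e.g. (8, 6) for an RTX 3090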
# the model and extensions are now managed in textgen_settings.yaml
CLI_ARGS=--listen --listen-host 0.0.0.0 --rwkv-cuda-on --xformers --extensions api openai
# txt2image api
SD_WEBUI_URL=http://0.0.0.0:7861
# OpenAI api = http://0.0.0.0:5001/v1
OPENEDAI_PORT=5001
# https://github.com/oobabooga/text-generation-webui/tree/main/extensions/openai
LISTEN_HOST=0.0.0.0
HOST_PORT=7860
CONTAINER_PORT=7860 # main webui
# the port the api blocking endpoint binds to on the host
BLOCKING_PORT=5001 # openai api
HOST_BLOCKING_PORT=5001
# the port the api binds to on the host
HOST_API_PORT=7861 # textgen api
CONTAINER_API_PORT=7861
# the port the api stream endpoint binds to on the host
HOST_API_STREAM_PORT=5005 # Textgen streaming api
HOST_STREAMING_PORT=5005
CONTAINER_API_STREAM_PORT=5005
STREAMING_PORT=5005
# the git ref text-generation-webui is installed from
WEBUI_VERSION=HEAD
INSTALL_EXTENSIONS=true
# you could also add -march=native here to optimise the llama.cpp build for the host CPU
CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_CUBLAS=on -DLLAMA_CLBLAST=on -DLLAMA_HIPBLAS=on -DLLAMA_F16C=on -DLLAMA_AVX512=on -DLLAMA_AVX2=on -DLLAMA_FMA=on"
GPU_FLAGS=all
GPUS=all
FORCE_CMAKE=1
FORCE_CUDA=1
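
### Willow Inference Server environment (.willow.env - referenced by the compose file below) ###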
# Which docker image to run
IMAGE=${IMAGE:-willow-inference-server}
# HTTPS Listen port
LISTEN_PORT_HTTPS=${LISTEN_PORT_HTTPS:-19001}
# Listen port
LISTEN_PORT=${LISTEN_PORT:-19000}
# Media port range
# WebRTC dynamically negotiates UDP ports for each session
# You should keep this as small as possible for expected WebRTC connections
MEDIA_PORT_RANGE=${MEDIA_PORT_RANGE:-10000-10050}
# Listen IP
LISTEN_IP=${LISTEN_IP:-0.0.0.0}
# GPUS - WIP for docker compose
GPUS="all"
# Allow forwarded IPs. This is a list of hosts to allow parsing of X-Forwarded headers from
# FORWARDED_ALLOW_IPS=
# allow all
FORWARDED_ALLOW_IPS="*"
# Shared memory size for docker
SHM_SIZE=1g
# Docker image tag
TAG=latest
NAME=wis
# CTranslate2 config options
CT2_VERBOSE=0
QUANT="float16"
container="docker"
# Log level - acceptable values are debug, info, warning, error, critical. Suggest info or debug.
LOG_LEVEL=${LOG_LEVEL:-warning}
### TBC if these work as envs ###
# The default whisper model to use. Options are "tiny", "base", "small", "medium", "large"
WHISPER_MODEL_DEFAULT="medium"
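
### WIS custom_settings.py - mounted into the wis container as /app/custom_settings.py ###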
from functools import lru_cache
from typing import List

from pydantic import BaseSettings


class APISettings(BaseSettings):
    # # Project metadata
    # name: str = "Willow Inference Server"
    # description: str = "High Performance Language Inference API"
    # version: str = "1.0"

    # # Note: More beams is more accurate but slower.
    # # default beam_size - 5 is lib default, 1 for greedy
    # beam_size: int = 1
    # # default beam size for longer transcriptions
    # long_beam_size: int = 3
    # # Audio duration in ms to activate "long" mode. Any audio longer than this will use long_beam_size.
    # long_beam_size_threshold: int = 12000
    # model_threads: int = 10

    # # Default language
    # language: str = "en"
    # # Default detect language?
    # detect_language: bool = False

    # if False, load models only on first use
    # this saves GPU RAM but costs latency on first calls
    preload_all_models: bool = False

    # Models to preload
    # if preload_all_models is True, these are irrelevant
    # preload_whisper_model_tiny: bool = True
    # preload_whisper_model_base: bool = True
    # preload_whisper_model_small: bool = True
    preload_whisper_model_medium: bool = True
    # preload_whisper_model_large: bool = True
    preload_chatbot_model: bool = True  # only used if support_chatbot is True too
    preload_tts_model: bool = True  # only used if support_tts is True too

    # # TTS CUDA memory threshold - equivalent of 4GB GPUs
    # tts_memory_threshold: int = 3798205849
    # # SV CUDA memory threshold - equivalent of 6GB GPUs
    # sv_memory_threshold: int = 5798205849

    # # Enable chunking support
    # support_chunking: bool = True
    # # There is really no reason to disable chunking anymore
    # # But if you still want to, you can set this threshold higher
    # # current value is equivalent of 4GB GPUs
    # chunking_memory_threshold: int = 3798205849
    # # Maximum number of chunks that are loaded into the GPU at once
    # # This will need to be tweaked based on GPU RAM and model used.
    # # 8GB GPUs should support at least 2 chunks so starting with that
    # concurrent_gpu_chunks: int = 2

    # Enable TTS
    support_tts: bool = True
    # # Enable SV
    # support_sv: bool = False
    # # SV threshold
    # sv_threshold: float = 0.75

    # The default whisper model to use. Options are "tiny", "base", "small", "medium", "large"
    whisper_model_default: str = 'medium'

    # # Default TTS format to use
    # tts_default_format: str = "FLAC"
    # # Default TTS speaker to use. CLB is US female
    # tts_default_speaker: str = "CLB"

    # # List of allowed origins for WebRTC. See https://fastapi.tiangolo.com/tutorial/cors/#use-corsmiddleware
    # cors_allowed_origins: List[str] = []

    # # If basic_auth_pass or basic_auth_user are set all endpoints are guarded by basic auth
    # # If basic_auth_user is falsy it will not be checked. If basic_auth_pass is falsy it will not be checked.
    # basic_auth_user: str = None
    # basic_auth_pass: str = None

    # Support chatbot
    support_chatbot: bool = True
    # Path to chatbot model - downloaded from HuggingFace at runtime by default (gets cached)
    chatbot_model_path: str = 'TheBloke/vicuna-13B-v1.5-GPTQ'
    # # Chatbot pipeline default temperature
    # chatbot_temperature: float = 0.7
    # # Chatbot pipeline default top_p
    # chatbot_top_p: float = 0.95
    # # Chatbot pipeline default repetition penalty
    # chatbot_repetition_penalty: float = 1.15
    # Chatbot pipeline default max new tokens
    chatbot_max_new_tokens: int = 2048

    # # aiortc debug for connectivity and other WebRTC debugging
    # aiortc_debug: bool = False

    class Config:
        env_prefix = ""
        case_sensitive = False


@lru_cache()
def get_api_settings() -> APISettings:
    return APISettings()  # reads variables from environment
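
# Example (not part of the original gist): pydantic BaseSettings reads matching
# environment variables (case-insensitive here), so values set in .willow.env or
# docker compose override the defaults above. The two variables below are just
# illustrative overrides of fields defined in APISettings.
if __name__ == "__main__":
    import os

    os.environ["WHISPER_MODEL_DEFAULT"] = "large"
    os.environ["SUPPORT_TTS"] = "false"

    settings = get_api_settings()
    print(settings.whisper_model_default)  # -> 'large'
    print(settings.support_tts)            # -> False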
---
### Willow / Textgen / Home Assistant Docker Compose ###
#
# WARNING: this is an EXAMPLE ONLY, and will not work as-is!
#
# Traefik is optional, but some people have asked how I use it.
# I have a separate docker-compose file for traefik v3 which is pretty specific to my setup.
# I've probably missed a bunch of things, but you get the idea hopefully and I'll do a proper blog post at some point.
#
###
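# Example usage (not part of the original gist):
#   docker network create willow           # the external network referenced below
#   docker compose --profile wis up -d     # 'wis' sits behind a compose profile, so enable it explicitly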
### YAML Anchors ###
x-gpu: &gpu
  runtime: nvidia
  deploy:
    resources:
      reservations:
        devices:
          - driver: nvidia
            count: all
            capabilities: ["compute", "utility", "graphics"]

x-restart: &restart
  restart: unless-stopped

x-secopts: &secopts
  security_opt:
    - no-new-privileges:true

# Assumed definition for the *limits-mem-512 anchor referenced by wis-nginx below (adjust to taste)
x-limits-mem-512: &limits-mem-512
  mem_limit: 512m

x-ai-common: &ai-common
  <<: [*restart, *secopts]
  environment:
    - PUID=${PUID:-1001}
    - PGID=${PGID:-1001}

# Host paths referenced by the volume mounts below - define these in your .env
# (or export them in your shell) before running docker compose:
#   wis_git=/home/username/git/wis
#   wis_config=/home/username/wis_config
#   textgen_models=/home/username/llm_models
#   textgen_config=/home/username/textgen_config
#   ha_config=/home/username/ha_config
volumes:
  cache:
  wis:
  textgen:

networks:
  willow:
    external: true # docker network create willow
    name: willow

####
services:
  &name willow: # WAS - willow application server
    container_name: *name
    env_file:
      - .willow.env
    hostname: *name
    <<: [*ai-common, *gpu]
    image: ghcr.io/toverainc/willow-application-server:latest
    shm_size: 1g
    ipc: host # https://docs.docker.com/engine/reference/run/#ipc-settings---ipc
    ulimits:
      memlock: -1
      stack: 67108864
    ports:
      - 8502 # optionally remove if using traefik
    volumes:
      - cache:/root/.cache
    networks:
      - willow
    # labels:
    #   traefik.enable: true
    #   traefik.http.routers.willow.rule: Host(`willow.your.domain`)
    #   traefik.http.routers.willow.entrypoints: websecure
    #   traefik.http.routers.willow.tls: true
    #   traefik.http.routers.willow.tls.certresolver: le
    #   traefik.http.routers.willow.service: willow-service
    #   traefik.http.services.willow-service.loadbalancer.server.port: 8502
  &name wis-nginx:
    container_name: *name
    env_file:
      - .willow.env
    hostname: *name
    <<: [*ai-common, *limits-mem-512]
    depends_on:
      - wis
    image: nginx:latest
    volumes:
      - ${wis_git}/nginx:/nginx
    environment:
      - NGINX_ENTRYPOINT_WORKER_PROCESSES_AUTOTUNE=1
    ports:
      - 19000 # optionally remove if using traefik
      - 19001 # optionally remove if using traefik
    networks:
      - willow
    # labels:
    #   traefik.enable: true
    #   traefik.http.routers.wis-nginx.rule: Host(`wis-nginx.your.domain`)
    #   traefik.http.routers.wis-nginx.tls.certresolver: le
    #   traefik.http.routers.wis-nginx.entrypoints: websecure
    #   traefik.http.routers.wis-nginx.tls.domains[0].main: "*.your.domain"
    #   traefik.http.routers.wis-nginx.service: wis-nginx-service
    #   traefik.http.services.wis-nginx-service.loadbalancer.server.port: 19001
  ### Willow Inference Server for TTS ###
  &name wis:
    container_name: *name
    env_file:
      - .willow.env
    hostname: *name
    <<: [*ai-common, *gpu]
    profiles:
      - *name
    image: willow-inference-server:latest
    shm_size: 1g
    ipc: host # https://docs.docker.com/engine/reference/run/#ipc-settings---ipc
    ulimits:
      memlock: -1
      stack: 67108864
    build:
      context: https://github.com/toverainc/willow-inference-server.git#main
      dockerfile: Dockerfile
    ports:
      - 0.0.0.0:10002-10050:10002-10050 # optionally remove if using traefik
      - 19000:19000 # optionally remove if using traefik
      - 19001:19001 # optionally remove if using traefik
    volumes:
      - cache:/root/.cache
      - ${wis_config}/custom_settings.py:/app/custom_settings.py:ro
      - ${wis_git}:/app
    command: ./entrypoint.sh
    networks:
      - willow
    # labels:
    #   # TODO: my traefik setup for WIS is missing a few things that would allow the port mappings to be removed
    #   traefik.enable: true
    #   traefik.http.routers.wis.rule: Host(`wis.your.domain`)
    #   traefik.http.routers.wis.entrypoints: websecure
    #   traefik.http.routers.wis.tls: true
    #   traefik.http.routers.wis.tls.certresolver: le
    #   traefik.http.routers.wis.tls.domains[0].main: "*.your.domain"
    #   traefik.http.services.wis-service.loadbalancer.server.port: 19000
    #   # Middleware for Buffering (Traefik doesn't support disabling buffering, but you can set limits)
    #   traefik.http.middlewares.wis-buffering.buffering.maxRequestBodyBytes: 104857600 # 100MB
    #   traefik.http.middlewares.wis-buffering.buffering.maxResponseBodyBytes: 104857600 # 100MB
    #   # Apply Middlewares
    #   traefik.http.routers.wis.middlewares: wis-buffering #wis-headers
  &name textgen:
    container_name: *name
    env_file:
      - .textgen.env
    hostname: *name
    <<: [*ai-common, *gpu]
    build:
      context: https://github.com/oobabooga/text-generation-webui.git#main
      dockerfile: docker/Dockerfile
      tags:
        - docker.io/yourusername/textgen:latest #latest
      shm_size: "8gb"
      args:
        # Tesla P100 = sm_60, sm_61, sm_62 and compute_60, compute_61, compute_62
        # RTX 3090 = sm_86 and compute_86
        - "TORCH_CUDA_ARCH_LIST=8.6;8.6+PTX"
        - GPUS=all
        - TAG=docker.io/yourusername/textgen:latest #latest #cuda12
        - CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_CUBLAS=on -DLLAMA_CLBLAST=on -DLLAMA_HIPBLAS=on -DLLAMA_F16C=on -DLLAMA_AVX512=on -DLLAMA_AVX2=on -DLLAMA_FMA=on"
        - FORCE_CMAKE=1
        - WEBUI_VERSION=HEAD
        - FORCE_CUDA=1
    security_opt:
      - seccomp:unconfined
    stdin_open: true
    tty: true
    shm_size: "16gb"
    networks:
      - willow
    ports:
      - "7860:7860" # Gradio Web UI
      - "7861:7861" # Textgen API (blocking)
      - "5001:5001" # OpenAI compatible API (https://localai.your.domain/v1)
      - "5005:5005" # Textgen Websockets API (Stream)
    volumes:
      - cache:/root/.cache:rw
      - ${textgen_models}:/app/models:cached
      - ${textgen_config}/textgen_settings.yaml:/app/settings.yaml
    # labels:
    #   traefik.enable: true
    #   traefik.http.routers.textgen.rule: Host(`textgen.your.domain`) || Host(`textgen`)
    #   traefik.http.routers.textgen.tls.certresolver: le
    #   traefik.http.routers.textgen.entrypoints: websecure
    #   traefik.http.routers.textgen.tls.domains[0].main: "*.your.domain"
    #   traefik.http.routers.textgen.service: textgen-service
    #   traefik.http.services.textgen-service.loadbalancer.server.port: 7860 # Web UI
    #   ### Stablediff
    #   traefik.http.routers.textgen-stablediff.rule: Host(`textgen.your.domain`) || Host(`textgen`) || Host(`textgen-stablediff.your.domain`)
    #   traefik.http.routers.textgen-stablediff.tls.certresolver: le
    #   traefik.http.routers.textgen-stablediff.entrypoints: stablediff # port 7861
    #   traefik.http.routers.textgen-stablediff.tls.domains[0].main: "*.your.domain"
    #   traefik.http.routers.textgen-stablediff.service: textgen-stablediff-service
    #   traefik.http.services.textgen-stablediff-service.loadbalancer.server.port: 7861 # Stable diffusion (and openai embedding?)
    #   ### Textgen API (blocking) 'http://textgen.your.domain:5000/api/v1/chat'
    #   traefik.http.routers.textgen-api-blocking.rule: Host(`textgen.your.domain`) || Host(`textgen`) || Host(`textgen-api-blocking.your.domain`)
    #   traefik.http.routers.textgen-api-blocking.tls.certresolver: le
    #   traefik.http.routers.textgen-api-blocking.entrypoints: textgenapi # port 5000
    #   traefik.http.routers.textgen-api-blocking.tls.domains[0].main: "*.your.domain"
    #   traefik.http.routers.textgen-api-blocking.service: textgen-api-blocking-service
    #   traefik.http.services.textgen-api-blocking-service.loadbalancer.server.port: 5000 # Textgen API (blocking)
    #   ### OpenAI compatible API 'https://openai.your.domain/v1'
    #   traefik.http.routers.openai.rule: Host(`openai.your.domain`) || Host(`openai`)
    #   traefik.http.routers.openai.tls.certresolver: le
    #   traefik.http.routers.openai.entrypoints: websecure # port 5001
    #   traefik.http.routers.openai.tls.domains[0].main: "*.your.domain"
    #   traefik.http.routers.openai.service: openai-service
    #   traefik.http.services.openai-service.loadbalancer.server.port: 5001 # OpenAI compatible API (https://openai.your.domain/v1)
    #   ### Textgen Websockets API (Stream) 'ws://textgen.your.domain:5005/api/v1/chat-stream'
    #   traefik.http.routers.textgen-api-ws.rule: Host(`textgen.your.domain`) || Host(`textgen`) || Host(`textgen-api-ws.your.domain`)
    #   traefik.http.routers.textgen-api-ws.tls.certresolver: le
    #   traefik.http.routers.textgen-api-ws.entrypoints: websockets # port 5005
    #   traefik.http.routers.textgen-api-ws.tls.domains[0].main: "*.your.domain"
    #   traefik.http.routers.textgen-api-ws.service: textgen-api-ws-service
    #   # traefik.http.routers.textgen-api-ws-service.middlewares: websocketsSSL
    #   traefik.http.services.textgen-api-ws-service.loadbalancer.server.port: 5005 # Textgen API (Stream)
  &name homeassistant:
    <<: [*restart, *secopts]
    container_name: *name
    image: "ghcr.io/home-assistant/home-assistant:stable"
    volumes:
      - ${ha_config}:/config
    environment:
      # DOCKER_MODS is an environment variable (linuxserver.io mod), not a volume mount
      - DOCKER_MODS=linuxserver/mods:homeassistant-hacs
    network_mode: host # optionally remove if using traefik
    # cap_add:
    #   - CAP_NET_RAW # optionally remove if using traefik
    #   - CAP_NET_BIND_SERVICE # optionally remove if using traefik
    # devices:
    #   # - /dev/ttyACM0:/dev/ttyACM0
    #   - /dev/ttyUSB0:/dev/ttyUSB0
    #   - /dev/zigbee:/dev/zigbee
    #   - /dev/hidraw0:/dev/hidraw0
    #   - /dev/hidraw1:/dev/hidraw1
    #   - /dev/hidraw2:/dev/hidraw2
    #   # - /dev/usb/hiddev0:/dev/usb/hiddev0
    #   - /dev/hci0:/dev/hci0
    #   - /sys/class/bluetooth/hci0:/sys/class/bluetooth/hci0
    # labels:
    #   traefik.enable: true
    #   traefik.http.routers.ha.rule: "Host(`ha.your.domain`) || Host(`homeassistant.your.domain`) || Host(`hass.your.domain`)"
    #   traefik.http.routers.ha.tls.certresolver: le
    #   traefik.http.routers.ha.entrypoints: websecure
    #   traefik.http.routers.ha.tls.domains[0].main: "*.your.domain"
    #   traefik.http.routers.ha.service: ha-service
    #   traefik.http.services.ha-service.loadbalancer.server.port: 8123
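
### textgen_settings.yaml - mounted into the textgen container as /app/settings.yaml ###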
dark_theme: false
show_controls: true
start_with: ''
mode: chat
chat_style: cai-chat
preset: Divine Intellect
max_new_tokens: 2048
max_new_tokens_min: 1
max_new_tokens_max: 4096
seed: -1
negative_prompt: ''
truncation_length: 32768
truncation_length_min: 0
truncation_length_max: 32768
custom_stopping_strings: ''
auto_max_new_tokens: true
max_tokens_second: 0
ban_eos_token: false
custom_token_bans: ''
add_bos_token: false
skip_special_tokens: true
stream: true
name1: You
character: Assistant
instruction_template: Llama-v2
chat-instruct_command: ''
autoload_model: true
default_extensions:
- api
- openai
name2: Assistant
context: This is a conversation with your Assistant designed to help with various tasks such as answering questions, providing recommendations, improving code and helping with decision making. You can ask it anything you want and it will do its best to give you accurate and relevant information.
greeting: null
turn_template: null
stop_at_newline: false
chat_generation_attempts: 1
chat_generation_attempts_min: 1
chat_generation_attempts_max: 10
chat_default_extensions:
- gallery
prompt: QA
code_syntax_highlight-activate: true
code_syntax_highlight-inline_highlight: true
code_syntax_highlight-copy_button: true
model: llava-v1.5-13b
multimodal_pipeline: llava-llama-2-13b
openai-port: 5001
openai-embedding_device: cuda
openai-sd_webui_url: http://127.0.0.1:7861
openai-debug: 1
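
### Example: calling the OpenAI-compatible API exposed by textgen (not part of the original gist) ###
# A minimal Python sketch, assuming the openai extension is enabled on port 5001 (OPENEDAI_PORT above)
# and serves the usual /v1/chat/completions route; the host, prompt and token limit are placeholders.
import requests

resp = requests.post(
    "http://localhost:5001/v1/chat/completions",
    json={
        "messages": [{"role": "user", "content": "Say hello to Willow"}],
        "max_tokens": 128,
    },
    timeout=120,
)
print(resp.json()["choices"][0]["message"]["content"])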