Willow Speech + Local LLM + HomeAssistant
### .textgen.env (text-generation-webui environment, used by the textgen service below) ###
# https://github.com/oobabooga/text-generation-webui/blob/main/README.md
# https://github.com/oobabooga/text-generation-webui/blob/main/docs/Spell-book.md
# By default the Dockerfile specifies these versions: 3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX
# You can find the compute capability for your card at https://developer.nvidia.com/cuda-gpus
# Tesla P100 = sm_60, sm_61, sm_62 and compute_60, compute_61, compute_62
#   e.g. TORCH_CUDA_ARCH_LIST=6.0;6.1;6.2;7.0;7.5;8.0;8.6+PTX
# RTX 3090 = sm_86 and compute_86 (PTX)
# +PTX is an intermediate representation that lets kernels JIT-compile at runtime for any CC >= the specified CC, at a performance penalty
TORCH_CUDA_ARCH_LIST='8.6;8.6+PTX'
# the model and extensions are now managed in textgen_settings.yaml
CLI_ARGS=--listen --listen-host 0.0.0.0 --rwkv-cuda-on --xformers --extensions api openai
# txt2image api
SD_WEBUI_URL=http://0.0.0.0:7861
# OpenAI api = http://0.0.0.0:5001/v1
OPENEDAI_PORT=5001
# https://github.com/oobabooga/text-generation-webui/tree/main/extensions/openai
LISTEN_HOST=0.0.0.0
HOST_PORT=7860
CONTAINER_PORT=7860 # main webui
# the port the blocking api endpoint binds to on the host
BLOCKING_PORT=5001 # openai api
HOST_BLOCKING_PORT=5001
# the port the api binds to on the host
HOST_API_PORT=7861 # textgen api
CONTAINER_API_PORT=7861
# the port the api stream endpoint binds to on the host
HOST_API_STREAM_PORT=5005 # textgen streaming api
HOST_STREAMING_PORT=5005
CONTAINER_API_STREAM_PORT=5005
STREAMING_PORT=5005
# the git ref to install text-generation-webui from
WEBUI_VERSION=HEAD
INSTALL_EXTENSIONS=true
# maybe -march=native?
CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_CUBLAS=on -DLLAMA_CLBLAST=on -DLLAMA_HIPBLAS=on -DLLAMA_F16C=on -DLLAMA_AVX512=on -DLLAMA_AVX2=on -DLLAMA_FMA=on"
GPU_FLAGS=all
GPUS=all
FORCE_CMAKE=1
FORCE_CUDA=1
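
To pick the right TORCH_CUDA_ARCH_LIST value for your own GPU, you can query the compute capability directly from PyTorch instead of looking it up by hand. A minimal sketch (assumes a CUDA-enabled torch install that can see the GPU; the script name is just for illustration):

# check_cuda_arch.py - print each visible GPU's compute capability as a
# TORCH_CUDA_ARCH_LIST-style string, e.g. "8.6" for an RTX 3090
import torch

if not torch.cuda.is_available():
    raise SystemExit("CUDA is not available - check drivers / container runtime")

for idx in range(torch.cuda.device_count()):
    major, minor = torch.cuda.get_device_capability(idx)
    name = torch.cuda.get_device_name(idx)
    print(f"{name}: {major}.{minor}  ->  TORCH_CUDA_ARCH_LIST='{major}.{minor};{major}.{minor}+PTX'")

An RTX 3090 prints 8.6, which matches the value hard-coded above.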
### .willow.env (Willow Inference Server environment) ###
# Which docker image to run
IMAGE=${IMAGE:-willow-inference-server}
# HTTPS listen port
LISTEN_PORT_HTTPS=${LISTEN_PORT_HTTPS:-19001}
# Listen port
LISTEN_PORT=${LISTEN_PORT:-19000}
# Media port range
# WebRTC dynamically negotiates UDP ports for each session
# You should keep this as small as possible for the expected number of WebRTC connections
MEDIA_PORT_RANGE=${MEDIA_PORT_RANGE:-10000-10050}
# Listen IP
LISTEN_IP=${LISTEN_IP:-0.0.0.0}
# GPUS - WIP for docker compose
GPUS="all"
# Allow forwarded IPs. This is a list of hosts to allow parsing of X-Forwarded headers from
# FORWARDED_ALLOW_IPS=
# allow all
FORWARDED_ALLOW_IPS="*"
# Shared memory size for docker
SHM_SIZE=1g
# Docker image tag
TAG=latest
NAME=wis
# CTranslate2 config options
CT2_VERBOSE=0
QUANT="float16"
container="docker"
# Log level - acceptable values are debug, info, warning, error, critical. Suggest info or debug.
LOG_LEVEL=${LOG_LEVEL:-warning}
### TBC if these work as envs ###
# The default whisper model to use. Options are "tiny", "base", "small", "medium", "large"
WHISPER_MODEL_DEFAULT="medium"
### custom_settings.py (Willow Inference Server settings overrides) ###
from functools import lru_cache
from typing import List

from pydantic import BaseSettings


class APISettings(BaseSettings):
    # # Project metadata
    # name: str = "Willow Inference Server"
    # description: str = "High Performance Language Inference API"
    # version: str = "1.0"

    # # Note: More beams is more accurate but slower.
    # # default beam_size - 5 is lib default, 1 for greedy
    # beam_size: int = 1
    # # default beam size for longer transcriptions
    # long_beam_size: int = 3
    # # Audio duration in ms to activate "long" mode. Any audio longer than this will use long_beam_size.
    # long_beam_size_threshold: int = 12000
    # model_threads: int = 10

    # # Default language
    # language: str = "en"
    # # Detect language by default?
    # detect_language: bool = False

    # # If False, load models only on first use.
    # # This saves GPU RAM but costs latency on the first calls.
    preload_all_models: bool = False
    # # Models to preload
    # # if preload_all_models is True, these are irrelevant
    # preload_whisper_model_tiny: bool = True
    # preload_whisper_model_base: bool = True
    # preload_whisper_model_small: bool = True
    preload_whisper_model_medium: bool = True
    # preload_whisper_model_large: bool = True
    preload_chatbot_model: bool = True  # only used if support_chatbot is True too
    preload_tts_model: bool = True  # only used if support_tts is True too

    # # TTS CUDA memory threshold - equivalent of 4GB GPUs
    # tts_memory_threshold: int = 3798205849
    # # SV CUDA memory threshold - equivalent of 6GB GPUs
    # sv_memory_threshold: int = 5798205849

    # # Enable chunking support
    # support_chunking: bool = True
    # # There is really no reason to disable chunking anymore,
    # # but if you still want to, you can set this threshold higher.
    # # The current value is the equivalent of 4GB GPUs.
    # chunking_memory_threshold: int = 3798205849
    # # Maximum number of chunks that are loaded into the GPU at once.
    # # This will need to be tweaked based on GPU RAM and the model used.
    # # 8GB GPUs should support at least 2 chunks, so starting with that.
    # concurrent_gpu_chunks: int = 2

    # # Enable TTS
    support_tts: bool = True
    # # Enable SV
    # support_sv: bool = False
    # # SV threshold
    # sv_threshold: float = 0.75

    # # The default whisper model to use. Options are "tiny", "base", "small", "medium", "large"
    whisper_model_default: str = 'medium'

    # # Default TTS format to use
    # tts_default_format: str = "FLAC"
    # # Default TTS speaker to use. CLB is US female
    # tts_default_speaker: str = "CLB"

    # # List of allowed origins for WebRTC. See https://fastapi.tiangolo.com/tutorial/cors/#use-corsmiddleware
    # cors_allowed_origins: List[str] = []

    # # If basic_auth_pass or basic_auth_user are set, all endpoints are guarded by basic auth.
    # # If basic_auth_user is falsy it will not be checked. If basic_auth_pass is falsy it will not be checked.
    # basic_auth_user: str = None
    # basic_auth_pass: str = None

    # # Support chatbot
    support_chatbot: bool = True
    # # Path to chatbot model - downloaded from HuggingFace at runtime by default (gets cached)
    chatbot_model_path: str = 'TheBloke/vicuna-13B-v1.5-GPTQ'
    # # Chatbot pipeline default temperature
    # chatbot_temperature: float = 0.7
    # # Chatbot pipeline default top_p
    # chatbot_top_p: float = 0.95
    # # Chatbot pipeline default repetition penalty
    # chatbot_repetition_penalty: float = 1.15
    # # Chatbot pipeline default max new tokens
    chatbot_max_new_tokens: int = 2048

    # # aiortc debug for connectivity and other WebRTC debugging
    # aiortc_debug: bool = False

    class Config:
        env_prefix = ""
        case_sensitive = False


@lru_cache()
def get_api_settings() -> APISettings:
    return APISettings()  # reads variables from environment
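
Because APISettings is a pydantic BaseSettings class with an empty env_prefix and case sensitivity disabled, any field above can be overridden with an environment variable (for example from .willow.env) without editing the file, and get_api_settings() is cached so the values are read once per process. A rough usage sketch; the override values are examples only:

import os

# environment overrides win over the defaults defined in APISettings
os.environ["WHISPER_MODEL_DEFAULT"] = "large"
os.environ["SUPPORT_TTS"] = "false"

from custom_settings import get_api_settings

settings = get_api_settings()          # first call constructs and caches APISettings
print(settings.whisper_model_default)  # -> "large"
print(settings.support_tts)            # -> False (pydantic parses the boolean string)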
### docker-compose.yml ###
---
### Willow / Textgen / Home Assistant Docker Compose ###
#
# WARNING: this is an EXAMPLE ONLY, and will not work as-is!
#
# Traefik is optional, but some people have asked how I use it.
# I have a separate docker-compose file for traefik v3 which is pretty specific to my setup.
# I've probably missed a few things, but hopefully you get the idea, and I'll do a proper blog post at some point.
#
###

### YAML Anchors ###
x-gpu: &gpu
  runtime: nvidia
  deploy:
    resources:
      reservations:
        devices:
          - driver: nvidia
            count: all
            capabilities: ["compute", "utility", "graphics"]

x-restart: &restart
  restart: unless-stopped

x-secopts: &secopts
  security_opt:
    - no-new-privileges:true

# memory-limit anchor used by wis-nginx below; 512m is an assumed value
x-limits-mem-512: &limits-mem-512
  mem_limit: 512m

x-ai-common: &ai-common
  <<: [*restart, *secopts]
  environment:
    - PUID=${PUID:-1001}
    - PGID=${PGID:-1001}

# Host paths referenced below - set these in your shell or a .env file so the ${...}
# interpolation works when docker compose parses this file:
#   wis_git: /home/username/git/wis
#   wis_config: /home/username/wis_config
#   textgen_models: /home/username/llm_models
#   textgen_config: /home/username/textgen_config
#   ha_config: /home/username/ha_config

volumes:
  cache:
  wis:
  textgen:

networks:
  willow:
    external: true # docker network create willow
    name: willow

####
services:
  &name willow: # WAS - willow application server
    container_name: *name
    env_file:
      - .willow.env
    hostname: *name
    <<: [*ai-common, *gpu]
    image: ghcr.io/toverainc/willow-application-server:latest
    shm_size: 1g
    ipc: host # https://docs.docker.com/engine/reference/run/#ipc-settings---ipc
    ulimits:
      memlock: -1
      stack: 67108864
    ports:
      - 8502 # optionally remove if using traefik
    volumes:
      - cache:/root/.cache
    networks:
      - willow
    # labels:
    #   traefik.enable: true
    #   traefik.http.routers.willow.rule: Host(`willow.your.domain`)
    #   traefik.http.routers.willow.entrypoints: websecure
    #   traefik.http.routers.willow.tls: true
    #   traefik.http.routers.willow.tls.certresolver: le
    #   traefik.http.routers.willow.service: willow-service
    #   traefik.http.services.willow-service.loadbalancer.server.port: 8502

  &name wis-nginx:
    container_name: *name
    env_file:
      - .willow.env
    hostname: *name
    <<: [*ai-common, *limits-mem-512]
    depends_on:
      - wis
    image: nginx:latest
    volumes:
      - ${wis_git}/nginx:/nginx
    environment:
      - NGINX_ENTRYPOINT_WORKER_PROCESSES_AUTOTUNE=1
    ports:
      - 19000 # optionally remove if using traefik
      - 19001 # optionally remove if using traefik
    networks:
      - willow
    # labels:
    #   traefik.enable: true
    #   traefik.http.routers.wis-nginx.rule: Host(`wis-nginx.your.domain`)
    #   traefik.http.routers.wis-nginx.tls.certresolver: le
    #   traefik.http.routers.wis-nginx.entrypoints: websecure
    #   traefik.http.routers.wis-nginx.tls.domains[0].main: "*.your.domain"
    #   traefik.http.routers.wis-nginx.service: wis-nginx-service
    #   traefik.http.services.wis-nginx-service.loadbalancer.server.port: 19001

  ### Willow Inference Server for TTS ###
  &name wis:
    container_name: *name
    env_file:
      - .willow.env
    hostname: *name
    <<: [*ai-common, *gpu]
    profiles:
      - *name
    image: willow-inference-server:latest
    shm_size: 1g
    ipc: host # https://docs.docker.com/engine/reference/run/#ipc-settings---ipc
    ulimits:
      memlock: -1
      stack: 67108864
    build:
      context: https://github.com/toverainc/willow-inference-server.git#main
      dockerfile: Dockerfile
    ports:
      - 0.0.0.0:10002-10050:10002-10050 # optionally remove if using traefik
      - 19000:19000 # optionally remove if using traefik
      - 19001:19001 # optionally remove if using traefik
    volumes:
      - cache:/root/.cache
      - ${wis_config}/custom_settings.py:/app/custom_settings.py:ro
      - ${wis_git}:/app
    command: ./entrypoint.sh
    networks:
      - willow
    # labels:
    #   # TODO: my traefik setup for the WIS is missing a few things that would allow the port mappings to be removed
    #   traefik.enable: true
    #   traefik.http.routers.wis.rule: Host(`wis.your.domain`)
    #   traefik.http.routers.wis.entrypoints: websecure
    #   traefik.http.routers.wis.tls: true
    #   traefik.http.routers.wis.tls.certresolver: le
    #   traefik.http.routers.wis.tls.domains[0].main: "*.your.domain"
    #   traefik.http.services.wis-service.loadbalancer.server.port: 19000
    #   # Middleware for buffering (Traefik doesn't support disabling buffering, but you can set limits)
    #   traefik.http.middlewares.wis-buffering.buffering.maxRequestBodyBytes: 104857600 # 100MB
    #   traefik.http.middlewares.wis-buffering.buffering.maxResponseBodyBytes: 104857600 # 100MB
    #   # Apply middlewares
    #   traefik.http.routers.wis.middlewares: wis-buffering #wis-headers

  &name textgen:
    container_name: *name
    env_file:
      - .textgen.env
    hostname: *name
    <<: [*ai-common, *gpu]
    build:
      context: https://github.com/oobabooga/text-generation-webui.git#main
      dockerfile: docker/Dockerfile
      tags:
        - docker.io/yourusername/textgen:latest #latest
      shm_size: "8gb"
      args:
        # Tesla P100 = sm_60, sm_61, sm_62 and compute_60, compute_61, compute_62
        # RTX 3090 = sm_86 and compute_86
        - "TORCH_CUDA_ARCH_LIST=8.6;8.6+PTX"
        - GPUS=all
        - TAG=docker.io/yourusername/textgen:latest #latest #cuda12
        - CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_CUBLAS=on -DLLAMA_CLBLAST=on -DLLAMA_HIPBLAS=on -DLLAMA_F16C=on -DLLAMA_AVX512=on -DLLAMA_AVX2=on -DLLAMA_FMA=on"
        - FORCE_CMAKE=1
        - WEBUI_VERSION=HEAD
        - FORCE_CUDA=1
    security_opt:
      - seccomp:unconfined
    stdin_open: true
    tty: true
    shm_size: "16gb"
    networks:
      - willow
    ports:
      - "7860:7860" # Gradio Web UI
      - "7861:7861" # Textgen API (blocking)
      - "5001:5001" # OpenAI compatible API (https://localai.your.domain/v1)
      - "5005:5005" # Textgen Websockets API (stream)
    volumes:
      - cache:/root/.cache:rw
      - ${textgen_models}:/app/models:cached
      - ${textgen_config}/textgen_settings.yaml:/app/settings.yaml
    # labels:
    #   traefik.enable: true
    #   traefik.http.routers.textgen.rule: Host(`textgen.your.domain`) || Host(`textgen`)
    #   traefik.http.routers.textgen.tls.certresolver: le
    #   traefik.http.routers.textgen.entrypoints: websecure
    #   traefik.http.routers.textgen.tls.domains[0].main: "*.your.domain"
    #   traefik.http.routers.textgen.service: textgen-service
    #   traefik.http.services.textgen-service.loadbalancer.server.port: 7860 # Web UI
    #   ### Stablediff
    #   traefik.http.routers.textgen-stablediff.rule: Host(`textgen.your.domain`) || Host(`textgen`) || Host(`textgen-stablediff.your.domain`)
    #   traefik.http.routers.textgen-stablediff.tls.certresolver: le
    #   traefik.http.routers.textgen-stablediff.entrypoints: stablediff # port 7861
    #   traefik.http.routers.textgen-stablediff.tls.domains[0].main: "*.your.domain"
    #   traefik.http.routers.textgen-stablediff.service: textgen-stablediff-service
    #   traefik.http.services.textgen-stablediff-service.loadbalancer.server.port: 7861 # Stable diffusion (and openai embedding?)
    #   ### Textgen API (blocking) 'http://textgen.your.domain:5000/api/v1/chat'
    #   traefik.http.routers.textgen-api-blocking.rule: Host(`textgen.your.domain`) || Host(`textgen`) || Host(`textgen-api-blocking.your.domain`)
    #   traefik.http.routers.textgen-api-blocking.tls.certresolver: le
    #   traefik.http.routers.textgen-api-blocking.entrypoints: textgenapi # port 5000
    #   traefik.http.routers.textgen-api-blocking.tls.domains[0].main: "*.your.domain"
    #   traefik.http.routers.textgen-api-blocking.service: textgen-api-blocking-service
    #   traefik.http.services.textgen-api-blocking-service.loadbalancer.server.port: 5000 # Textgen API (blocking)
    #   ### OpenAI compatible API 'https://openai.your.domain/v1'
    #   traefik.http.routers.openai.rule: Host(`openai.your.domain`) || Host(`openai`)
    #   traefik.http.routers.openai.tls.certresolver: le
    #   traefik.http.routers.openai.entrypoints: websecure # port 5001
    #   traefik.http.routers.openai.tls.domains[0].main: "*.your.domain"
    #   traefik.http.routers.openai.service: openai-service
    #   traefik.http.services.openai-service.loadbalancer.server.port: 5001 # OpenAI compatible API (https://openai.your.domain/v1)
    #   ### Textgen Websockets API (stream) 'ws://textgen.your.domain:5005/api/v1/chat-stream'
    #   traefik.http.routers.textgen-api-ws.rule: Host(`textgen.your.domain`) || Host(`textgen`) || Host(`textgen-api-ws.your.domain`)
    #   traefik.http.routers.textgen-api-ws.tls.certresolver: le
    #   traefik.http.routers.textgen-api-ws.entrypoints: websockets # port 5005
    #   traefik.http.routers.textgen-api-ws.tls.domains[0].main: "*.your.domain"
    #   traefik.http.routers.textgen-api-ws.service: textgen-api-ws-service
    #   # traefik.http.routers.textgen-api-ws-service.middlewares: websocketsSSL
    #   traefik.http.services.textgen-api-ws-service.loadbalancer.server.port: 5005 # Textgen API (stream)

  &name homeassistant:
    <<: [*restart, *secopts]
    container_name: *name
    image: "ghcr.io/home-assistant/home-assistant:stable"
    volumes:
      - ${ha_config}:/config
    environment:
      # note: DOCKER_MODS only takes effect on linuxserver.io images, not the official HA image
      - DOCKER_MODS=linuxserver/mods:homeassistant-hacs
    network_mode: host # optionally remove if using traefik
    # cap_add:
    #   - CAP_NET_RAW # optionally remove if using traefik
    #   - CAP_NET_BIND_SERVICE # optionally remove if using traefik
    # devices:
    #   - /dev/ttyACM0:/dev/ttyACM0
    #   - /dev/ttyUSB0:/dev/ttyUSB0
    #   - /dev/zigbee:/dev/zigbee
    #   - /dev/hidraw0:/dev/hidraw0
    #   - /dev/hidraw1:/dev/hidraw1
    #   - /dev/hidraw2:/dev/hidraw2
    #   - /dev/usb/hiddev0:/dev/usb/hiddev0
    #   - /dev/hci0:/dev/hci0
    #   - /sys/class/bluetooth/hci0:/sys/class/bluetooth/hci0
    # labels:
    #   traefik.enable: true
    #   traefik.http.routers.ha.rule: "Host(`ha.your.domain`) || Host(`homeassistant.your.domain`) || Host(`hass.your.domain`)"
    #   traefik.http.routers.ha.tls.certresolver: le
    #   traefik.http.routers.ha.entrypoints: websecure
    #   traefik.http.routers.ha.tls.domains[0].main: "*.your.domain"
    #   traefik.http.routers.ha.service: ha
    #   traefik.http.services.ha-service.loadbalancer.server.port: 8123
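
Once the stack is up (note that the wis service sits behind the wis profile, so it only starts when that profile is enabled, e.g. docker compose --profile wis up -d), a quick way to confirm things are running is to probe the published ports from the Docker host. The willow and wis-nginx services only publish container ports (Docker assigns ephemeral host ports), so this minimal sketch checks the explicitly mapped ones, assuming everything is on localhost with the default ports above:

# check_stack.py - TCP-probe the host ports published by the compose file above
import socket

SERVICES = {
    "wis (HTTP)": 19000,
    "wis (HTTPS)": 19001,
    "textgen web UI": 7860,
    "textgen blocking API": 7861,
    "textgen OpenAI-compatible API": 5001,
    "textgen streaming API": 5005,
    "home assistant (host network)": 8123,
}

for name, port in SERVICES.items():
    try:
        with socket.create_connection(("localhost", port), timeout=2):
            print(f"OK    {name} on :{port}")
    except OSError:
        print(f"DOWN  {name} on :{port}")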
### textgen_settings.yaml (text-generation-webui settings, mounted at /app/settings.yaml) ###
dark_theme: false
show_controls: true
start_with: ''
mode: chat
chat_style: cai-chat
preset: Divine Intellect
max_new_tokens: 2048
max_new_tokens_min: 1
max_new_tokens_max: 4096
seed: -1
negative_prompt: ''
truncation_length: 32768
truncation_length_min: 0
truncation_length_max: 16384
custom_stopping_strings: ''
auto_max_new_tokens: true
max_tokens_second: 0
ban_eos_token: false
custom_token_bans: ''
add_bos_token: false
skip_special_tokens: true
stream: true
name1: You
character: Assistant
instruction_template: Llama-v2
chat-instruct_command: ''
autoload_model: true
default_extensions:
  - api
  - openai
name2: Assistant
context: This is a conversation with your Assistant designed to help with various tasks such as answering questions, providing recommendations, improving code and helping with decision making. You can ask it anything you want and it will do its best to give you accurate and relevant information.
greeting: null
turn_template: null
stop_at_newline: false
chat_generation_attempts: 1
chat_generation_attempts_min: 1
chat_generation_attempts_max: 10
chat_default_extensions:
  - gallery
prompt: QA
code_syntax_highlight-activate: true
code_syntax_highlight-inline_highlight: true
code_syntax_highlight-copy_button: true
model: llava-v1.5-13b
multimodal_pipeline: llava-llama-2-13b
openai-port: 5001
openai-embedding_device: cuda
openai-sd_webui_url: http://127.0.0.1:7861
openai-debug: 1
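
With the api and openai extensions enabled above and port 5001 published by the textgen service, the loaded model can be queried with any OpenAI-style client. A minimal sketch using only the standard library; the model name and prompt are placeholders, and it assumes the extension's /v1/chat/completions route on localhost:

# ask_textgen.py - send a chat completion request to the OpenAI-compatible API on :5001
import json
import urllib.request

payload = {
    "model": "local-model",  # required by the API shape; the server answers with whatever model it has loaded
    "messages": [{"role": "user", "content": "Summarise what the Willow voice pipeline does."}],
    "max_tokens": 200,
}

req = urllib.request.Request(
    "http://localhost:5001/v1/chat/completions",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)

with urllib.request.urlopen(req, timeout=120) as resp:
    body = json.load(resp)

print(body["choices"][0]["message"]["content"])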