LiteLLM Load Balancing with free endpoints (nested configs)

Two LiteLLM proxies run nested: a base proxy on port 4001 load-balances the individual provider deployments (litellm-config.yaml), and an aggregate proxy on port 4002 exposes them all as a single weighted "free" model (litellm-config-aggregate.yaml), pointed at the base proxy via LITELLM_API_BASE.

# ~/.litellm/.env — provider API keys, sourced at boot by the crontab entries below
SAMBANOVA_API_KEY=
GEMINI_API_KEY=
HUGGINGFACE_API_KEY=
HYPERBOLIC_API_KEY=
OPENAI_API_KEY=
GROQ_API_KEY=
XAI_API_KEY=
OPENROUTER_API_KEY=
COHERE_API_KEY=
TOGETHERAI_API_KEY=
CEREBRAS_API_KEY=
GLHF_API_KEY=
DEEPSEEK_API_KEY=

# crontab — start the base proxy (:4001) and the aggregate proxy (:4002) at boot
@reboot TERM=dumb bash -lc "cd ~/.litellm/ ; set -a ; source .env ; litellm --config litellm-config.yaml --host 0.0.0.0 --port 4001 --debug > /tmp/litellm-4001.log 2>&1"
@reboot TERM=dumb bash -lc "cd ~/.litellm/ ; set -a ; source .env ; LITELLM_API_BASE=http://localhost:4001/ litellm --config litellm-config-aggregate.yaml --host 0.0.0.0 --port 4002 --debug > /tmp/litellm-4002.log 2>&1"
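
Once the @reboot entries have run, a quick smoke test is to list what each proxy serves. A minimal sketch, assuming the default host/port above, no master_key set (so no auth header is needed), and jq installed:

# Base proxy (:4001) should list the per-provider model names;
# the aggregate proxy (:4002) should list the single "free" alias.
curl -s http://localhost:4001/v1/models | jq -r '.data[].id'
curl -s http://localhost:4002/v1/models | jq -r '.data[].id'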

# litellm-config-aggregate.yaml (port 4002) — one weighted "free" alias across the deployments below
model_list:
  - model_name: free
    litellm_params:
      model: openai/DeepSeek-V3
      api_base: os.environ/LITELLM_API_BASE
      api_key: sk-doesnt-matter
      weight: 1
      rpm: 1
      tpm: 10000
  - model_name: free
    litellm_params:
      model: openai/gemini-2.0-flash-exp
      api_base: os.environ/LITELLM_API_BASE
      api_key: sk-doesnt-matter
      weight: 2
      rpm: 2
      tpm: 20000
  - model_name: free
    litellm_params:
      model: openai/Meta-Llama-3.1-405B-Instruct
      api_base: os.environ/LITELLM_API_BASE
      api_key: sk-doesnt-matter
      weight: 3
      rpm: 3
      tpm: 30000
  - model_name: free
    litellm_params:
      model: openai/Qwen2.5-Coder-32B-Instruct
      api_base: os.environ/LITELLM_API_BASE
      api_key: sk-doesnt-matter
      weight: 4
      rpm: 4
      tpm: 40000
  - model_name: free
    litellm_params:
      model: openai/Llama3.3-70B-Instruct
      api_base: os.environ/LITELLM_API_BASE
      api_key: sk-doesnt-matter
      weight: 5
      rpm: 5
      tpm: 50000
  - model_name: free
    litellm_params:
      model: openai/command-r-plus
      api_base: https://api.cohere.com/v1/
      api_key: os.environ/COHERE_API_KEY
      weight: 6
      rpm: 6
      tpm: 60000
    model_info:
      context_length: 128000
  - model_name: free
    litellm_params:
      model: openai/grok-2-latest
      api_base: https://api.x.ai/v1
      api_key: os.environ/XAI_API_KEY
      weight: 6
      rpm: 6
      tpm: 60000
    model_info:
      context_length: 128000
  - model_name: free
    litellm_params:
      model: groq/deepseek-r1-distill-llama-70b
      api_key: os.environ/GROQ_API_KEY
      weight: 6
      rpm: 6
      tpm: 60000
    model_info:
      context_length: 128000

litellm_settings:
  num_retries: 3
  request_timeout: 60
  routing_strategy: "weighted-pick"
  allowed_fails: 2    # cool a deployment down if it fails more than 2 calls in a minute
  cooldown_time: 30   # seconds to keep a deployment cooled down once fails/min exceeds allowed_fails
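
With this aggregate config loaded on port 4002, a single request for model "free" is routed across all the deployments above, subject to each entry's weight and rpm/tpm caps. A minimal client sketch, assuming the proxy is reachable on localhost:

curl -s http://localhost:4002/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "free",
        "messages": [{"role": "user", "content": "Reply with one word: ping"}]
      }'

Repeated calls should spread across the backends; an entry that exceeds its limits or trips allowed_fails is skipped for cooldown_time seconds.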

# litellm-config.yaml (port 4001) — per-provider deployments behind the base proxy
model_list:
  # --- Example -1: DeepSeek v3 ---
  - model_name: DeepSeek-v3
    litellm_params:
      model: openai/deepseek-ai/DeepSeek-V3
      #model: deepseek-ai/DeepSeek-V3
      api_base: https://api.hyperbolic.xyz/v1
      api_key: os.environ/HYPERBOLIC_API_KEY
      rpm: 60           # 60 requests per minute
  - model_name: DeepSeek-v3-DeepSeek
    litellm_params:
      model: deepseek/deepseek-chat
      api_key: os.environ/DEEPSEEK_API_KEY

  # --- Example 0: Gemini Flash 2.0 ---
  - model_name: gemini-2.0-flash-exp
    litellm_params:
      model: gemini/gemini-2.0-flash-exp
      api_key: os.environ/GEMINI_API_KEY
      rpm: 1            # 1 request per minute
      tpm: 5000         # 5,000 tokens per minute
      drop_params: true
      weight: 50
  - model_name: gemini-2.0-flash-exp
    litellm_params:
      model: openrouter/google/gemini-2.0-flash-exp:free
      api_key: os.environ/OPENROUTER_API_KEY
      rpm: 20           # 20 requests per minute
      tpm: 30000        # 30,000 tokens per minute
      drop_params: true
      weight: 1

  # --- Example 1: Simple free load balancing ---
  - model_name: Meta-Llama-3.1-405B-Instruct
    litellm_params:
      model: openrouter/meta-llama/llama-3.1-405b-instruct:free
      api_key: os.environ/OPENROUTER_API_KEY
    model_info:
      context_length: 16384
  - model_name: Meta-Llama-3.1-405B-Instruct
    litellm_params:
      model: sambanova/Meta-Llama-3.1-405B-Instruct
      api_key: os.environ/SAMBANOVA_API_KEY
    model_info:
      context_length: 16384

  # --- Example 2: Simple failover ---
  - model_name: llama3.2
    litellm_params:
      model: ollama/hf.co/bartowski/Llama-3.2-3B-Instruct-uncensored-GGUF:Q8_0
      api_base: http://10.0.0.33:11434
      timeout: 5            # seconds
      stream_timeout: 3     # seconds
      max_retries: 1        # attempts
  - model_name: llama3.2-slow
    litellm_params:
      model: ollama/hf.co/bartowski/Llama-3.2-3B-Instruct-uncensored-GGUF:Q8_0
      api_base: http://10.0.0.36:11436
    # fallback configuration at bottom

  # --- Example 3: Load balancing free (simple-shuffle) with paid fallback ---
  - model_name: Qwen2.5-Coder-32B-Instruct
    litellm_params:
      model: huggingface/Qwen/Qwen2.5-Coder-32B-Instruct
      api_key: os.environ/HUGGINGFACE_API_KEY
      additional_drop_params: ["presence_penalty", "frequency_penalty"]
  - model_name: Qwen2.5-Coder-32B-Instruct
    litellm_params:
      model: sambanova/Qwen2.5-Coder-32B-Instruct
      api_key: os.environ/SAMBANOVA_API_KEY
  - model_name: Qwen2.5-Coder-32B-Instruct
    litellm_params:
      model: openai/hf:Qwen/Qwen2.5-Coder-32B-Instruct
      api_base: https://glhf.chat/api/openai/v1
      api_key: os.environ/GLHF_API_KEY
      rpm: 15           # 15 requests per minute
      tpm: 20000        # 20,000 tokens per minute
      weight: 4         # smaller weight = less preferred
    # fallback configuration at bottom
  - model_name: Qwen2.5-Coder-32B-Instruct-TogetherAI
    litellm_params:
      model: together_ai/Qwen/Qwen2.5-Coder-32B-Instruct
      api_key: os.environ/TOGETHERAI_API_KEY
      api_base: https://api.together.xyz/v1
      additional_drop_params: ["function_call", "functions", "tools", "tool_choice", "response_format"]
    model_info:
      context_length: 32768
      function_calling_enabled: false

  # --- Example 4: Weighted free load balancing with limits and paid fallback ---
  # - model_name: Llama3.3-70B-Instruct
  #   litellm_params:
  #     model: groq/llama-3.3-70b-versatile
  #     api_key: os.environ/GROQ_API_KEY
  #     rpm: 20         # 20 requests per minute
  #     tpm: 30000      # 30,000 tokens per minute
  #     weight: 5
  - model_name: Llama3.3-70B-Instruct
    litellm_params:
      model: sambanova/Meta-Llama-3.3-70B-Instruct
      api_key: os.environ/SAMBANOVA_API_KEY
      rpm: 10           # 10 requests per minute
      tpm: 15000        # 15,000 tokens per minute
      weight: 3
  - model_name: Llama3.3-70B-Instruct
    litellm_params:
      model: openai/llama-3.3-70b
      api_base: https://api.cerebras.ai/v1
      api_key: os.environ/CEREBRAS_API_KEY
      rpm: 15           # 15 requests per minute
      tpm: 20000        # 20,000 tokens per minute
      weight: 4
  - model_name: Llama3.3-70B-Instruct
    litellm_params:
      model: together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free
      api_key: os.environ/TOGETHERAI_API_KEY
      api_base: https://api.together.xyz/v1
      rpm: 20
      tpm: 30000
      weight: 5
    # fallback configuration at bottom
  - model_name: Llama3.3-70B-Instruct-Hyperbolic
    litellm_params:
      model: openai/meta-llama/Llama-3.3-70B-Instruct
      api_base: https://api.hyperbolic.xyz/v1
      api_key: os.environ/HYPERBOLIC_API_KEY

litellm_settings:
  num_retries: 3
  request_timeout: 10
  routing_strategy: "weighted-pick"
  allowed_fails: 1      # cool a deployment down if it fails more than 1 call in a minute
  cooldown_time: 180    # seconds to keep a deployment cooled down once fails/min exceeds allowed_fails
  fallbacks: [
    {"DeepSeek-v3": ["DeepSeek-v3-DeepSeek"]},
    {"llama3.2": ["llama3.2-slow"]},
    {"Qwen2.5-Coder-32B-Instruct": ["Qwen2.5-Coder-32B-Instruct-TogetherAI"]},
    {"Llama3.3-70B-Instruct": ["Llama3.3-70B-Instruct-Hyperbolic"]},
  ]
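
To exercise the fallback pairs against the base proxy, request one of the primary aliases; if its deployments all fail or are cooling down, the router retries the mapped fallback (e.g. Qwen2.5-Coder-32B-Instruct falls back to Qwen2.5-Coder-32B-Instruct-TogetherAI). A minimal sketch, assuming localhost and no master_key:

curl -s http://localhost:4001/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "Qwen2.5-Coder-32B-Instruct",
        "messages": [{"role": "user", "content": "Write a one-line Python function that reverses a string."}]
      }'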