LiteLLM Load Balancing with free endpoints (nested configs)
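Two stacked LiteLLM proxies: an inner proxy on port 4001 load-balances each model family across free providers with paid fallbacks, and an outer "aggregate" proxy on port 4002 exposes all of it as a single `free` model, pointed back at the inner proxy through `LITELLM_API_BASE`. Files below: `.env`, the crontab entries, `litellm-config-aggregate.yaml`, and `litellm-config.yaml`.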
`.env` — provider API keys (fill in the ones you use); loaded via `set -a ; source .env` in the crontab entries below:

```sh
SAMBANOVA_API_KEY=
GEMINI_API_KEY=
HUGGINGFACE_API_KEY=
HYPERBOLIC_API_KEY=
OPENAI_API_KEY=
GROQ_API_KEY=
XAI_API_KEY=
OPENROUTER_API_KEY=
COHERE_API_KEY=
TOGETHERAI_API_KEY=
CEREBRAS_API_KEY=
GLHF_API_KEY=
DEEPSEEK_API_KEY=
```
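To run everything by hand before wiring it into cron, load the same environment in a shell. A minimal sketch, assuming the files live in `~/.litellm/` as the crontab entries below expect:

```sh
cd ~/.litellm/
set -a        # auto-export every variable assigned while sourcing
source .env
set +a
# start the inner proxy first: it talks to the real providers
litellm --config litellm-config.yaml --host 0.0.0.0 --port 4001
```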
crontab — starts the inner proxy (port 4001) and the aggregate proxy (port 4002) at boot, capturing both stdout and stderr in a log under `/tmp`:

```sh
@reboot TERM=dumb bash -lc "cd ~/.litellm/ ; set -a ; source .env ; litellm --config litellm-config.yaml --host 0.0.0.0 --port 4001 --debug > /tmp/litellm-4001.log 2>&1"
@reboot TERM=dumb bash -lc "cd ~/.litellm/ ; set -a ; source .env ; LITELLM_API_BASE=http://localhost:4001/ litellm --config litellm-config-aggregate.yaml --host 0.0.0.0 --port 4002 --debug > /tmp/litellm-4002.log 2>&1"
```
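Once both proxies are up, listing the models each one exposes is a quick sanity check on the nesting. A hedged smoke test, assuming no master key is configured and `jq` is installed:

```sh
# inner proxy: should list the concrete model groups (DeepSeek-v3, llama3.2, ...)
curl -s http://localhost:4001/v1/models | jq -r '.data[].id'

# aggregate proxy: should list only the "free" alias
curl -s http://localhost:4002/v1/models | jq -r '.data[].id'
```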
`litellm-config-aggregate.yaml` — run on port 4002, this exposes a single `free` alias. Five deployments are routed back through the inner proxy via `LITELLM_API_BASE`; the last three hit Cohere, xAI, and Groq directly. Higher weights are picked more often:

```yaml
model_list:
  - model_name: free
    litellm_params:
      model: openai/DeepSeek-v3              # matches the inner proxy's model_name
      api_base: os.environ/LITELLM_API_BASE
      api_key: sk-doesnt-matter              # inner proxy has no auth
      weight: 1
      rpm: 1
      tpm: 10000
  - model_name: free
    litellm_params:
      model: openai/gemini-2.0-flash-exp
      api_base: os.environ/LITELLM_API_BASE
      api_key: sk-doesnt-matter
      weight: 2
      rpm: 2
      tpm: 20000
  - model_name: free
    litellm_params:
      model: openai/Meta-Llama-3.1-405B-Instruct
      api_base: os.environ/LITELLM_API_BASE
      api_key: sk-doesnt-matter
      weight: 3
      rpm: 3
      tpm: 30000
  - model_name: free
    litellm_params:
      model: openai/Qwen2.5-Coder-32B-Instruct
      api_base: os.environ/LITELLM_API_BASE
      api_key: sk-doesnt-matter
      weight: 4
      rpm: 4
      tpm: 40000
  - model_name: free
    litellm_params:
      model: openai/Llama3.3-70B-Instruct
      api_base: os.environ/LITELLM_API_BASE
      api_key: sk-doesnt-matter
      weight: 5
      rpm: 5
      tpm: 50000
  - model_name: free
    litellm_params:
      model: openai/command-r-plus           # called directly, not via the inner proxy
      api_base: https://api.cohere.com/v1/
      api_key: os.environ/COHERE_API_KEY
      weight: 6
      rpm: 6
      tpm: 60000
    model_info:
      context_length: 128000
  - model_name: free
    litellm_params:
      model: openai/grok-2-latest
      api_base: https://api.x.ai/v1
      api_key: os.environ/XAI_API_KEY
      weight: 6
      rpm: 6
      tpm: 60000
    model_info:
      context_length: 128000
  - model_name: free
    litellm_params:
      model: groq/deepseek-r1-distill-llama-70b
      api_key: os.environ/GROQ_API_KEY
      weight: 6
      rpm: 6
      tpm: 60000
    model_info:
      context_length: 128000

litellm_settings:
  num_retries: 3
  request_timeout: 60
  routing_strategy: "weighted-pick"
  allowed_fails: 2   # cool a model down if it fails more than 2 calls in a minute
  cooldown_time: 30  # seconds to keep a model cooled down once fails/min > allowed_fails
```
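With the aggregate proxy running, clients only ever ask for `free`; the router picks a backend according to the weights and rate limits above. A hedged example request against port 4002, again assuming no master key:

```sh
curl -s http://localhost:4002/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "free",
        "messages": [{"role": "user", "content": "Say hello in one sentence."}]
      }' | jq -r '.choices[0].message.content'
```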
`litellm-config.yaml` — the inner proxy on port 4001. Each example groups deployments under a shared `model_name` for load balancing, with paid fallbacks wired up in the `fallbacks` list at the bottom:

```yaml
model_list:
  # --- Example -1: DeepSeek v3 ---
  - model_name: DeepSeek-v3
    litellm_params:
      model: openai/deepseek-ai/DeepSeek-V3
      #model: deepseek-ai/DeepSeek-V3
      api_base: https://api.hyperbolic.xyz/v1
      api_key: os.environ/HYPERBOLIC_API_KEY
      rpm: 60      # 60 requests per minute
  - model_name: DeepSeek-v3-DeepSeek
    litellm_params:
      model: deepseek/deepseek-chat
      api_key: os.environ/DEEPSEEK_API_KEY
  #
  # --- Example 0: Gemini Flash 2.0 ---
  - model_name: gemini-2.0-flash-exp
    litellm_params:
      model: gemini/gemini-2.0-flash-exp
      api_key: os.environ/GEMINI_API_KEY
      rpm: 1       # 1 request per minute
      tpm: 5000    # 5,000 tokens per minute
      drop_params: true
      weight: 50
  - model_name: gemini-2.0-flash-exp
    litellm_params:
      model: openrouter/google/gemini-2.0-flash-exp:free
      api_key: os.environ/OPENROUTER_API_KEY
      rpm: 20      # 20 requests per minute
      tpm: 30000   # 30,000 tokens per minute
      drop_params: true
      weight: 1
  # --- Example 1: Simple free load balancing ---
  - model_name: Meta-Llama-3.1-405B-Instruct
    litellm_params:
      model: openrouter/meta-llama/llama-3.1-405b-instruct:free
      api_key: os.environ/OPENROUTER_API_KEY
    model_info:
      context_length: 16384
  - model_name: Meta-Llama-3.1-405B-Instruct
    litellm_params:
      model: sambanova/Meta-Llama-3.1-405B-Instruct
      api_key: os.environ/SAMBANOVA_API_KEY
    model_info:
      context_length: 16384
  # --- Example 2: Simple failover ---
  - model_name: llama3.2
    litellm_params:
      model: ollama/hf.co/bartowski/Llama-3.2-3B-Instruct-uncensored-GGUF:Q8_0
      api_base: http://10.0.0.33:11434
      timeout: 5          # seconds
      stream_timeout: 3   # seconds
      max_retries: 1      # attempts
  - model_name: llama3.2-slow
    litellm_params:
      model: ollama/hf.co/bartowski/Llama-3.2-3B-Instruct-uncensored-GGUF:Q8_0
      api_base: http://10.0.0.36:11436
    # fallback configuration at bottom
  # --- Example 3: Load balancing free (simple-shuffle) with paid fallback ---
  - model_name: Qwen2.5-Coder-32B-Instruct
    litellm_params:
      model: huggingface/Qwen/Qwen2.5-Coder-32B-Instruct
      api_key: os.environ/HUGGINGFACE_API_KEY
      additional_drop_params: ["presence_penalty", "frequency_penalty"]
  - model_name: Qwen2.5-Coder-32B-Instruct
    litellm_params:
      model: sambanova/Qwen2.5-Coder-32B-Instruct
      api_key: os.environ/SAMBANOVA_API_KEY
  - model_name: Qwen2.5-Coder-32B-Instruct
    litellm_params:
      model: openai/hf:Qwen/Qwen2.5-Coder-32B-Instruct
      api_base: https://glhf.chat/api/openai/v1
      api_key: os.environ/GLHF_API_KEY
      rpm: 15      # 15 requests per minute
      tpm: 20000   # 20,000 tokens per minute
      weight: 4    # higher weight is picked more often
    # fallback configuration at bottom
  - model_name: Qwen2.5-Coder-32B-Instruct-TogetherAI
    litellm_params:
      model: together_ai/Qwen/Qwen2.5-Coder-32B-Instruct
      api_key: os.environ/TOGETHERAI_API_KEY
      api_base: https://api.together.xyz/v1
      additional_drop_params: ["function_call", "functions", "tools", "tool_choice", "response_format"]
    model_info:
      context_length: 32768
      function_calling_enabled: false
  # --- Example 4: Weighted free load balancing with limits and paid fallback ---
  # - model_name: Llama3.3-70B-Instruct
  #   litellm_params:
  #     model: groq/llama-3.3-70b-versatile
  #     api_key: os.environ/GROQ_API_KEY
  #     rpm: 20      # 20 requests per minute
  #     tpm: 30000   # 30,000 tokens per minute
  #     weight: 5
  - model_name: Llama3.3-70B-Instruct
    litellm_params:
      model: sambanova/Meta-Llama-3.3-70B-Instruct
      api_key: os.environ/SAMBANOVA_API_KEY
      rpm: 10      # 10 requests per minute
      tpm: 15000   # 15,000 tokens per minute
      weight: 3
  - model_name: Llama3.3-70B-Instruct
    litellm_params:
      model: openai/llama-3.3-70b
      api_base: https://api.cerebras.ai/v1
      api_key: os.environ/CEREBRAS_API_KEY
      rpm: 15      # 15 requests per minute
      tpm: 20000   # 20,000 tokens per minute
      weight: 4
  - model_name: Llama3.3-70B-Instruct
    litellm_params:
      model: together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free
      api_key: os.environ/TOGETHERAI_API_KEY
      api_base: https://api.together.xyz/v1
      rpm: 20
      tpm: 30000
      weight: 5
    # fallback configuration at bottom
  - model_name: Llama3.3-70B-Instruct-Hyperbolic
    litellm_params:
      model: openai/meta-llama/Llama-3.3-70B-Instruct
      api_base: https://api.hyperbolic.xyz/v1
      api_key: os.environ/HYPERBOLIC_API_KEY

litellm_settings:
  num_retries: 3
  request_timeout: 10
  routing_strategy: "weighted-pick"
  allowed_fails: 1     # cool a model down if it fails more than 1 call in a minute
  cooldown_time: 180   # seconds to keep a model cooled down once fails/min > allowed_fails
  fallbacks: [
    {"DeepSeek-v3": ["DeepSeek-v3-DeepSeek"]},
    {"llama3.2": ["llama3.2-slow"]},
    {"Qwen2.5-Coder-32B-Instruct": ["Qwen2.5-Coder-32B-Instruct-TogetherAI"]},
    {"Llama3.3-70B-Instruct": ["Llama3.3-70B-Instruct-Hyperbolic"]},
  ]
```
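Against the inner proxy you request the grouped names directly, and the `fallbacks` list only engages once a whole group is failing or cooling down. To see which deployment actually served a call, recent LiteLLM versions echo it in `x-litellm-*` response headers (header names vary by version, so treat this as an assumption):

```sh
# -s -i keeps the response headers alongside the body
curl -s -i http://localhost:4001/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "Llama3.3-70B-Instruct", "messages": [{"role": "user", "content": "hi"}]}' \
  | grep -i '^x-litellm'
```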