LiteLLM Load Balancing with free endpoints (nested configs)

Two LiteLLM proxies are chained. An inner proxy on port 4001 (litellm-config.yaml) load-balances and fails over across individual free endpoints for each model. An outer "aggregate" proxy on port 4002 (litellm-config-aggregate.yaml) exposes everything under a single model name, free, and weight-picks a backend per request. The aggregate proxy reaches the inner one through the LITELLM_API_BASE environment variable set in the crontab below.
.env — API keys, one per provider (values left blank here; fill in the ones you use):

    SAMBANOVA_API_KEY=
    GEMINI_API_KEY=
    HUGGINGFACE_API_KEY=
    HYPERBOLIC_API_KEY=
    OPENAI_API_KEY=
    GROQ_API_KEY=
    XAI_API_KEY=
    OPENROUTER_API_KEY=
    COHERE_API_KEY=
    TOGETHERAI_API_KEY=
    CEREBRAS_API_KEY=
    GLHF_API_KEY=
    DEEPSEEK_API_KEY=
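The crontab entries below load these keys with `set -a` before `source .env`, so every variable is exported to the litellm process. To start a proxy by hand with the same environment, something like this should work (a sketch, using the paths from the crontab):

    cd ~/.litellm
    set -a; source .env; set +a   # export every key in .env
    litellm --config litellm-config.yaml --host 0.0.0.0 --port 4001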
crontab — both proxies start at boot; LITELLM_API_BASE points the aggregate proxy at the inner one:

    @reboot TERM=dumb bash -lc "cd ~/.litellm/ ; set -a ; source .env ; litellm --config litellm-config.yaml --host 0.0.0.0 --port 4001 --debug > /tmp/litellm-4001.log 2>&1"
    @reboot TERM=dumb bash -lc "cd ~/.litellm/ ; set -a ; source .env ; LITELLM_API_BASE=http://localhost:4001/ litellm --config litellm-config-aggregate.yaml --host 0.0.0.0 --port 4002 --debug > /tmp/litellm-4002.log 2>&1"
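Once both proxies are up, the OpenAI-compatible model listing is a quick sanity check (assuming no master key is configured, no Authorization header is needed):

    curl -s http://localhost:4001/v1/models   # inner proxy: DeepSeek-v3, gemini-2.0-flash-exp, llama3.2, ...
    curl -s http://localhost:4002/v1/models   # aggregate proxy: the single "free" alias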
litellm-config-aggregate.yaml — the outer proxy on port 4002. Every deployment shares the model_name free: the first five route back to the inner proxy through LITELLM_API_BASE (as generic openai/ endpoints), and the last three go straight to Cohere, xAI, and Groq. Higher weight and larger rpm/tpm budgets push more traffic toward the later entries:

    model_list:
      - model_name: free
        litellm_params:
          model: openai/DeepSeek-v3        # must match the model_name exposed by the inner proxy
          api_base: os.environ/LITELLM_API_BASE
          api_key: sk-doesnt-matter        # inner proxy has no auth configured
          weight: 1
          rpm: 1
          tpm: 10000
      - model_name: free
        litellm_params:
          model: openai/gemini-2.0-flash-exp
          api_base: os.environ/LITELLM_API_BASE
          api_key: sk-doesnt-matter
          weight: 2
          rpm: 2
          tpm: 20000
      - model_name: free
        litellm_params:
          model: openai/Meta-Llama-3.1-405B-Instruct
          api_base: os.environ/LITELLM_API_BASE
          api_key: sk-doesnt-matter
          weight: 3
          rpm: 3
          tpm: 30000
      - model_name: free
        litellm_params:
          model: openai/Qwen2.5-Coder-32B-Instruct
          api_base: os.environ/LITELLM_API_BASE
          api_key: sk-doesnt-matter
          weight: 4
          rpm: 4
          tpm: 40000
      - model_name: free
        litellm_params:
          model: openai/Llama3.3-70B-Instruct
          api_base: os.environ/LITELLM_API_BASE
          api_key: sk-doesnt-matter
          weight: 5
          rpm: 5
          tpm: 50000
      - model_name: free
        litellm_params:
          model: openai/command-r-plus
          api_base: https://api.cohere.com/v1/
          api_key: os.environ/COHERE_API_KEY
          weight: 6
          rpm: 6
          tpm: 60000
        model_info:
          context_length: 128000
      - model_name: free
        litellm_params:
          model: openai/grok-2-latest
          api_base: https://api.x.ai/v1
          api_key: os.environ/XAI_API_KEY
          weight: 6
          rpm: 6
          tpm: 60000
        model_info:
          context_length: 128000
      - model_name: free
        litellm_params:
          model: groq/deepseek-r1-distill-llama-70b
          api_key: os.environ/GROQ_API_KEY
          weight: 6
          rpm: 6
          tpm: 60000
        model_info:
          context_length: 128000

    litellm_settings:
      num_retries: 3
      request_timeout: 60
      routing_strategy: "weighted-pick"
      allowed_fails: 2      # cool a deployment down if it fails more than 2 calls in a minute
      cooldown_time: 30     # seconds a deployment stays cooled down once allowed_fails is exceeded
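With the aggregate proxy running, a client just asks for `free` and the weights above decide which backend serves the request. A minimal test (the prompt is arbitrary; the key can be any string since the proxy has no auth configured):

    curl -s http://localhost:4002/v1/chat/completions \
      -H "Content-Type: application/json" \
      -H "Authorization: Bearer sk-anything" \
      -d '{"model": "free", "messages": [{"role": "user", "content": "Say hello in five words."}]}'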
litellm-config.yaml — the inner proxy on port 4001. Each "Example" groups several deployments under one model_name so requests shuffle across free endpoints, with paid or slower endpoints wired in as fallbacks at the bottom:

    model_list:
      # --- Example -1: DeepSeek v3 ---
      - model_name: DeepSeek-v3
        litellm_params:
          model: openai/deepseek-ai/DeepSeek-V3
          #model: deepseek-ai/DeepSeek-V3
          api_base: https://api.hyperbolic.xyz/v1
          api_key: os.environ/HYPERBOLIC_API_KEY
          rpm: 60             # 60 requests per minute
      - model_name: DeepSeek-v3-DeepSeek
        litellm_params:
          model: deepseek/deepseek-chat
          api_key: os.environ/DEEPSEEK_API_KEY
      #
      # --- Example 0: Gemini Flash 2.0 ---
      - model_name: gemini-2.0-flash-exp
        litellm_params:
          model: gemini/gemini-2.0-flash-exp
          api_key: os.environ/GEMINI_API_KEY
          rpm: 1              # 1 request per minute
          tpm: 5000           # 5,000 tokens per minute
          drop_params: true
          weight: 50
      - model_name: gemini-2.0-flash-exp
        litellm_params:
          model: openrouter/google/gemini-2.0-flash-exp:free
          api_key: os.environ/OPENROUTER_API_KEY
          rpm: 20             # 20 requests per minute
          tpm: 30000          # 30,000 tokens per minute
          drop_params: true
          weight: 1
      # --- Example 1: Simple free load balancing ---
      - model_name: Meta-Llama-3.1-405B-Instruct
        litellm_params:
          model: openrouter/meta-llama/llama-3.1-405b-instruct:free
          api_key: os.environ/OPENROUTER_API_KEY
        model_info:
          context_length: 16384
      - model_name: Meta-Llama-3.1-405B-Instruct
        litellm_params:
          model: sambanova/Meta-Llama-3.1-405B-Instruct
          api_key: os.environ/SAMBANOVA_API_KEY
        model_info:
          context_length: 16384
      # --- Example 2: Simple failover ---
      - model_name: llama3.2
        litellm_params:
          model: ollama/hf.co/bartowski/Llama-3.2-3B-Instruct-uncensored-GGUF:Q8_0
          api_base: http://10.0.0.33:11434
          timeout: 5          # seconds
          stream_timeout: 3   # seconds
          max_retries: 1      # attempts
      - model_name: llama3.2-slow
        litellm_params:
          model: ollama/hf.co/bartowski/Llama-3.2-3B-Instruct-uncensored-GGUF:Q8_0
          api_base: http://10.0.0.36:11436
        # fallback configuration at bottom
      # --- Example 3: Load balancing free (simple-shuffle) with paid fallback ---
      - model_name: Qwen2.5-Coder-32B-Instruct
        litellm_params:
          model: huggingface/Qwen/Qwen2.5-Coder-32B-Instruct
          api_key: os.environ/HUGGINGFACE_API_KEY
          additional_drop_params: ["presence_penalty", "frequency_penalty"]
      - model_name: Qwen2.5-Coder-32B-Instruct
        litellm_params:
          model: sambanova/Qwen2.5-Coder-32B-Instruct
          api_key: os.environ/SAMBANOVA_API_KEY
      - model_name: Qwen2.5-Coder-32B-Instruct
        litellm_params:
          model: openai/hf:Qwen/Qwen2.5-Coder-32B-Instruct
          api_base: https://glhf.chat/api/openai/v1
          api_key: os.environ/GLHF_API_KEY
          rpm: 15             # 15 requests per minute
          tpm: 20000          # 20,000 tokens per minute
          weight: 4           # higher weight = picked more often
        # fallback configuration at bottom
      - model_name: Qwen2.5-Coder-32B-Instruct-TogetherAI
        litellm_params:
          model: together_ai/Qwen/Qwen2.5-Coder-32B-Instruct
          api_key: os.environ/TOGETHERAI_API_KEY
          api_base: https://api.together.xyz/v1
          additional_drop_params: ["function_call", "functions", "tools", "tool_choice", "response_format"]
        model_info:
          context_length: 32768
          function_calling_enabled: false
      # --- Example 4: Weighted free load balancing with limits and paid fallback ---
      # - model_name: Llama3.3-70B-Instruct
      #   litellm_params:
      #     model: groq/llama-3.3-70b-versatile
      #     api_key: os.environ/GROQ_API_KEY
      #     rpm: 20           # 20 requests per minute
      #     tpm: 30000        # 30,000 tokens per minute
      #     weight: 5
      - model_name: Llama3.3-70B-Instruct
        litellm_params:
          model: sambanova/Meta-Llama-3.3-70B-Instruct
          api_key: os.environ/SAMBANOVA_API_KEY
          rpm: 10             # 10 requests per minute
          tpm: 15000          # 15,000 tokens per minute
          weight: 3
      - model_name: Llama3.3-70B-Instruct
        litellm_params:
          model: openai/llama-3.3-70b
          api_base: https://api.cerebras.ai/v1
          api_key: os.environ/CEREBRAS_API_KEY
          rpm: 15             # 15 requests per minute
          tpm: 20000          # 20,000 tokens per minute
          weight: 4
      - model_name: Llama3.3-70B-Instruct
        litellm_params:
          model: together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free
          api_key: os.environ/TOGETHERAI_API_KEY
          api_base: https://api.together.xyz/v1
          rpm: 20
          tpm: 30000
          weight: 5
        # fallback configuration at bottom
      - model_name: Llama3.3-70B-Instruct-Hyperbolic
        litellm_params:
          model: openai/meta-llama/Llama-3.3-70B-Instruct
          api_base: https://api.hyperbolic.xyz/v1
          api_key: os.environ/HYPERBOLIC_API_KEY

    litellm_settings:
      num_retries: 3
      request_timeout: 10
      routing_strategy: "weighted-pick"
      allowed_fails: 1      # cool a deployment down if it fails more than 1 call in a minute
      cooldown_time: 180    # seconds a deployment stays cooled down once allowed_fails is exceeded
      fallbacks: [
        {"DeepSeek-v3": ["DeepSeek-v3-DeepSeek"]},
        {"llama3.2": ["llama3.2-slow"]},
        {"Qwen2.5-Coder-32B-Instruct": ["Qwen2.5-Coder-32B-Instruct-TogetherAI"]},
        {"Llama3.3-70B-Instruct": ["Llama3.3-70B-Instruct-Hyperbolic"]},
      ]
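To exercise one alias (and its fallback chain) directly, target the inner proxy on port 4001. For example, Qwen2.5-Coder-32B-Instruct shuffles across the HuggingFace, SambaNova, and glhf.chat deployments, and falls back to the TogetherAI deployment once those cool down (the prompt is illustrative):

    curl -s http://localhost:4001/v1/chat/completions \
      -H "Content-Type: application/json" \
      -d '{"model": "Qwen2.5-Coder-32B-Instruct", "messages": [{"role": "user", "content": "Write a one-line hello world in Python."}]}'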