LiteLLM Load Balancing with free endpoints (nested configs)

Two LiteLLM proxies run nested: a base proxy on port 4001 load-balances the individual provider deployments (litellm-config.yaml), and an aggregate proxy on port 4002 exposes them all as a single weighted "free" model (litellm-config-aggregate.yaml), pointed at the base proxy via LITELLM_API_BASE.

# ~/.litellm/.env — provider API keys, sourced at boot by the crontab entries below
SAMBANOVA_API_KEY=
GEMINI_API_KEY=
HUGGINGFACE_API_KEY=
HYPERBOLIC_API_KEY=
OPENAI_API_KEY=
GROQ_API_KEY=
XAI_API_KEY=
OPENROUTER_API_KEY=
COHERE_API_KEY=
TOGETHERAI_API_KEY=
CEREBRAS_API_KEY=
GLHF_API_KEY=
DEEPSEEK_API_KEY=

# crontab — start the base proxy (:4001) and the aggregate proxy (:4002) at boot
@reboot TERM=dumb bash -lc "cd ~/.litellm/ ; set -a ; source .env ; litellm --config litellm-config.yaml --host 0.0.0.0 --port 4001 --debug > /tmp/litellm-4001.log 2>&1"
@reboot TERM=dumb bash -lc "cd ~/.litellm/ ; set -a ; source .env ; LITELLM_API_BASE=http://localhost:4001/ litellm --config litellm-config-aggregate.yaml --host 0.0.0.0 --port 4002 --debug > /tmp/litellm-4002.log 2>&1"
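
Once the @reboot entries have run, a quick smoke test is to list what each proxy serves. A minimal sketch, assuming the default host/port above, no master_key set (so no auth header is needed), and jq installed:

# Base proxy (:4001) should list the per-provider model names;
# the aggregate proxy (:4002) should list the single "free" alias.
curl -s http://localhost:4001/v1/models | jq -r '.data[].id'
curl -s http://localhost:4002/v1/models | jq -r '.data[].id'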

# litellm-config-aggregate.yaml (port 4002) — one weighted "free" alias across the deployments below
model_list:
  - model_name: free
    litellm_params:
      model: openai/DeepSeek-V3
      api_base: os.environ/LITELLM_API_BASE
      api_key: sk-doesnt-matter
      weight: 1
      rpm: 1
      tpm: 10000
  - model_name: free
    litellm_params:
      model: openai/gemini-2.0-flash-exp
      api_base: os.environ/LITELLM_API_BASE
      api_key: sk-doesnt-matter
      weight: 2
      rpm: 2
      tpm: 20000
  - model_name: free
    litellm_params:
      model: openai/Meta-Llama-3.1-405B-Instruct
      api_base: os.environ/LITELLM_API_BASE
      api_key: sk-doesnt-matter
      weight: 3
      rpm: 3
      tpm: 30000
  - model_name: free
    litellm_params:
      model: openai/Qwen2.5-Coder-32B-Instruct
      api_base: os.environ/LITELLM_API_BASE
      api_key: sk-doesnt-matter
      weight: 4
      rpm: 4
      tpm: 40000
  - model_name: free
    litellm_params:
      model: openai/Llama3.3-70B-Instruct
      api_base: os.environ/LITELLM_API_BASE
      api_key: sk-doesnt-matter
      weight: 5
      rpm: 5
      tpm: 50000
  - model_name: free
    litellm_params:
      model: openai/command-r-plus
      api_base: https://api.cohere.com/v1/
      api_key: os.environ/COHERE_API_KEY
      weight: 6
      rpm: 6
      tpm: 60000
    model_info:
      context_length: 128000
  - model_name: free
    litellm_params:
      model: openai/grok-2-latest
      api_base: https://api.x.ai/v1
      api_key: os.environ/XAI_API_KEY
      weight: 6
      rpm: 6
      tpm: 60000
    model_info:
      context_length: 128000
  - model_name: free
    litellm_params:
      model: groq/deepseek-r1-distill-llama-70b
      api_key: os.environ/GROQ_API_KEY
      weight: 6
      rpm: 6
      tpm: 60000
    model_info:
      context_length: 128000

litellm_settings:
  num_retries: 3
  request_timeout: 60
  routing_strategy: "weighted-pick"
  allowed_fails: 2    # cool a deployment down if it fails more than 2 calls in a minute
  cooldown_time: 30   # seconds to keep a deployment cooled down once fails/min exceeds allowed_fails
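
With this aggregate config loaded on port 4002, a single request for model "free" is routed across all the deployments above, subject to each entry's weight and rpm/tpm caps. A minimal client sketch, assuming the proxy is reachable on localhost:

curl -s http://localhost:4002/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "free",
        "messages": [{"role": "user", "content": "Reply with one word: ping"}]
      }'

Repeated calls should spread across the backends; an entry that exceeds its limits or trips allowed_fails is skipped for cooldown_time seconds.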

# litellm-config.yaml (port 4001) — per-provider deployments behind the base proxy
model_list:
  # --- Example -1: DeepSeek v3 ---
  - model_name: DeepSeek-v3
    litellm_params:
      model: openai/deepseek-ai/DeepSeek-V3
      #model: deepseek-ai/DeepSeek-V3
      api_base: https://api.hyperbolic.xyz/v1
      api_key: os.environ/HYPERBOLIC_API_KEY
      rpm: 60           # 60 requests per minute
  - model_name: DeepSeek-v3-DeepSeek
    litellm_params:
      model: deepseek/deepseek-chat
      api_key: os.environ/DEEPSEEK_API_KEY

  # --- Example 0: Gemini Flash 2.0 ---
  - model_name: gemini-2.0-flash-exp
    litellm_params:
      model: gemini/gemini-2.0-flash-exp
      api_key: os.environ/GEMINI_API_KEY
      rpm: 1            # 1 request per minute
      tpm: 5000         # 5,000 tokens per minute
      drop_params: true
      weight: 50
  - model_name: gemini-2.0-flash-exp
    litellm_params:
      model: openrouter/google/gemini-2.0-flash-exp:free
      api_key: os.environ/OPENROUTER_API_KEY
      rpm: 20           # 20 requests per minute
      tpm: 30000        # 30,000 tokens per minute
      drop_params: true
      weight: 1

  # --- Example 1: Simple free load balancing ---
  - model_name: Meta-Llama-3.1-405B-Instruct
    litellm_params:
      model: openrouter/meta-llama/llama-3.1-405b-instruct:free
      api_key: os.environ/OPENROUTER_API_KEY
    model_info:
      context_length: 16384
  - model_name: Meta-Llama-3.1-405B-Instruct
    litellm_params:
      model: sambanova/Meta-Llama-3.1-405B-Instruct
      api_key: os.environ/SAMBANOVA_API_KEY
    model_info:
      context_length: 16384

  # --- Example 2: Simple failover ---
  - model_name: llama3.2
    litellm_params:
      model: ollama/hf.co/bartowski/Llama-3.2-3B-Instruct-uncensored-GGUF:Q8_0
      api_base: http://10.0.0.33:11434
      timeout: 5            # seconds
      stream_timeout: 3     # seconds
      max_retries: 1        # attempts
  - model_name: llama3.2-slow
    litellm_params:
      model: ollama/hf.co/bartowski/Llama-3.2-3B-Instruct-uncensored-GGUF:Q8_0
      api_base: http://10.0.0.36:11436
    # fallback configuration at bottom

  # --- Example 3: Load balancing free (simple-shuffle) with paid fallback ---
  - model_name: Qwen2.5-Coder-32B-Instruct
    litellm_params:
      model: huggingface/Qwen/Qwen2.5-Coder-32B-Instruct
      api_key: os.environ/HUGGINGFACE_API_KEY
      additional_drop_params: ["presence_penalty", "frequency_penalty"]
  - model_name: Qwen2.5-Coder-32B-Instruct
    litellm_params:
      model: sambanova/Qwen2.5-Coder-32B-Instruct
      api_key: os.environ/SAMBANOVA_API_KEY
  - model_name: Qwen2.5-Coder-32B-Instruct
    litellm_params:
      model: openai/hf:Qwen/Qwen2.5-Coder-32B-Instruct
      api_base: https://glhf.chat/api/openai/v1
      api_key: os.environ/GLHF_API_KEY
      rpm: 15           # 15 requests per minute
      tpm: 20000        # 20,000 tokens per minute
      weight: 4         # smaller weight = less preferred
    # fallback configuration at bottom
  - model_name: Qwen2.5-Coder-32B-Instruct-TogetherAI
    litellm_params:
      model: together_ai/Qwen/Qwen2.5-Coder-32B-Instruct
      api_key: os.environ/TOGETHERAI_API_KEY
      api_base: https://api.together.xyz/v1
      additional_drop_params: ["function_call", "functions", "tools", "tool_choice", "response_format"]
    model_info:
      context_length: 32768
      function_calling_enabled: false

  # --- Example 4: Weighted free load balancing with limits and paid fallback ---
  # - model_name: Llama3.3-70B-Instruct
  #   litellm_params:
  #     model: groq/llama-3.3-70b-versatile
  #     api_key: os.environ/GROQ_API_KEY
  #     rpm: 20         # 20 requests per minute
  #     tpm: 30000      # 30,000 tokens per minute
  #     weight: 5
  - model_name: Llama3.3-70B-Instruct
    litellm_params:
      model: sambanova/Meta-Llama-3.3-70B-Instruct
      api_key: os.environ/SAMBANOVA_API_KEY
      rpm: 10           # 10 requests per minute
      tpm: 15000        # 15,000 tokens per minute
      weight: 3
  - model_name: Llama3.3-70B-Instruct
    litellm_params:
      model: openai/llama-3.3-70b
      api_base: https://api.cerebras.ai/v1
      api_key: os.environ/CEREBRAS_API_KEY
      rpm: 15           # 15 requests per minute
      tpm: 20000        # 20,000 tokens per minute
      weight: 4
  - model_name: Llama3.3-70B-Instruct
    litellm_params:
      model: together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free
      api_key: os.environ/TOGETHERAI_API_KEY
      api_base: https://api.together.xyz/v1
      rpm: 20
      tpm: 30000
      weight: 5
    # fallback configuration at bottom
  - model_name: Llama3.3-70B-Instruct-Hyperbolic
    litellm_params:
      model: openai/meta-llama/Llama-3.3-70B-Instruct
      api_base: https://api.hyperbolic.xyz/v1
      api_key: os.environ/HYPERBOLIC_API_KEY

litellm_settings:
  num_retries: 3
  request_timeout: 10
  routing_strategy: "weighted-pick"
  allowed_fails: 1      # cool a deployment down if it fails more than 1 call in a minute
  cooldown_time: 180    # seconds to keep a deployment cooled down once fails/min exceeds allowed_fails
  fallbacks: [
    {"DeepSeek-v3": ["DeepSeek-v3-DeepSeek"]},
    {"llama3.2": ["llama3.2-slow"]},
    {"Qwen2.5-Coder-32B-Instruct": ["Qwen2.5-Coder-32B-Instruct-TogetherAI"]},
    {"Llama3.3-70B-Instruct": ["Llama3.3-70B-Instruct-Hyperbolic"]},
  ]
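
To exercise the fallback pairs against the base proxy, request one of the primary aliases; if its deployments all fail or are cooling down, the router retries the mapped fallback (e.g. Qwen2.5-Coder-32B-Instruct falls back to Qwen2.5-Coder-32B-Instruct-TogetherAI). A minimal sketch, assuming localhost and no master_key:

curl -s http://localhost:4001/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "Qwen2.5-Coder-32B-Instruct",
        "messages": [{"role": "user", "content": "Write a one-line Python function that reverses a string."}]
      }'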