Skip to content

Instantly share code, notes, and snippets.

@ochafik
Last active June 5, 2024 16:49
Show Gist options
  • Select an option

  • Save ochafik/f6b51c95d7be2357bfdb22ce43faf01e to your computer and use it in GitHub Desktop.

Select an option

Save ochafik/f6b51c95d7be2357bfdb22ce43faf01e to your computer and use it in GitHub Desktop.
Multiple llama.cpp models under same endpoint

Runs multiple models in parallel under the same endpoint.

Prerequisites:

npm i -g pm2
pip install "fastapi[all]" httpx uvicorn

git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
make -j LLAMA_CURL=1 server

# Or if you're a lazy Mac user:
#   brew install llama.cpp

Then download multi.py below & start one server per model with the following command:

pm2 start --name llama.cpp multi.py -- --models='{
  "completions": {
    "phi-3-medium-128k-instruct": [
      "--hf-repo", "bartowski/Phi-3-medium-128k-instruct-GGUF",
      "--hf-file", "Phi-3-medium-128k-instruct-Q8_0.gguf",
      "-np", "4"
    ],
    "default": "phi-3-medium-128k-instruct"
  },
  "infill": {
    "codestral-22B-v0.1": {
      "--hf-repo": "bartowski/Codestral-22B-v0.1-GGUF",
      "--hf-file": "Codestral-22B-v0.1-Q8_0.gguf"
    },
    "default": "codestral-22B-v0.1"
  },
  "embeddings": {
    "nomic-embed-text-v1.5": [
      "--hf-repo", "nomic-ai/nomic-embed-text-v1.5-GGUF",
      "--hf-file", "nomic-embed-text-v1.5.Q4_K_M.gguf",
      "--rope-freq-scale", "0.75",
      "--embeddings",
      "-np", "16"
    ],
    "default": "nomic-embed-text-v1.5"
  }
}'

Some useful commands:

  • pm2 ls to check memory usage & status of all servers
  • pm2 logs to see what's going on
  • pm2 stop all to stop everything
  • pm2 startup (+ pm2 save) to have servers automatically restart on reboot

And query the server under the same umbrella endpoint:

curl -X POST http://localhost:8000/v1/completions -d '{
  "model": "Phi-3-medium-128k-instruct",
  "prompt": "Hello, world!",
  "stream": true
}'
curl -X POST http://localhost:8000/v1/embeddings -d '{
  "model": "nomic-embed-text-v1.5",
  "input": "Hello, world!"
}'

Or in Python:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1")

print(client.embeddings.create(
  model="text-embedding-3-small",
  input=["Hello, World"]
).data[0].embedding)

print(client.chat.completions.create(
  model="gpt-4o",
  response_format={ "type": "json_object" },
  messages=[
    {"role": "system", "content": "You are a helpful assistant designed to output JSON."},
    {"role": "user", "content": "Who won the world series in 2020?"}
  ]
).choices[0].message.content)
# Copyright 2024 Google LLC.
# SPDX-License-Identifier: Apache-2.0
from pathlib import Path
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
import argparse
import httpx
import json
import subprocess
from starlette.requests import Request
from starlette.responses import StreamingResponse
from starlette.background import BackgroundTask
import uvicorn
import shutil
# Extra args passed to every spawned llama-server: flash attention, context
# size from the model, and Prometheus-style /metrics.
DEFAULT_SERVER_ARGS = ["-fa", "-c", "0", "--metrics"]

# Locate the llama-server binary: prefer one on PATH, then a "server" binary
# sitting next to this script (the old `make server` output).
# BUG FIX: previously, when neither was found, DEFAULT_SERVER_BIN was never
# assigned and the argparse setup below crashed with NameError. Fall back to
# the canonical name so the error surfaces clearly at launch time instead,
# and --server-bin can still override it.
if shutil.which("llama-server"):
    DEFAULT_SERVER_BIN = "llama-server"
elif shutil.which(str(sb := Path(__file__).parent / "server")):
    DEFAULT_SERVER_BIN = sb
else:
    DEFAULT_SERVER_BIN = "llama-server"
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="FastAPI Proxy Server")
parser.add_argument("--host", default="0.0.0.0", help="Host address (default: 0.0.0.0)")
parser.add_argument("--port", type=int, default=8000, help="Port number (default: 8000)")
parser.add_argument("--models", type=json.loads, required=True, help="Model config (map of model name to llama.cpp args such as --hf-repo, --hf-file, etc.)")
parser.add_argument("--server-bin", type=str, default=DEFAULT_SERVER_BIN, help="Path to llama-server binary (default: %s)" % DEFAULT_SERVER_BIN)
parser.add_argument("--server-args", type=json.loads, default=DEFAULT_SERVER_ARGS, help="Extra server args (default: %s)" % " ".join(json.dumps(DEFAULT_SERVER_ARGS)))
args = parser.parse_args()
all_routes = {
"completions": {},
"embeddings": {},
}
next_port = args.port + 1
for kind, models in args.models.items():
for model_name, model_args in models.items():
if isinstance(model_args, str):
# Name alias
route = all_routes[kind][model_args]
all_routes[kind][model_name] = route
continue
server_port = next_port
next_port += 1
all_routes[kind][model_name] = f"http://localhost:{server_port}"
try: subprocess.check_call(["pm2", "delete", model_name])
except: pass
command = ["pm2", "start", "--name", model_name, args.server_bin, "--", "--port", str(server_port), *model_args, *args.server_args]
print(command)
subprocess.check_call(command)
clients = {
route: httpx.AsyncClient(base_url=route, timeout=None)
for route in set(
route
for routes in all_routes.values()
for route in routes.values()
)
}
app = FastAPI()
async def forward_call(request: Request):
request_body = await request.body()
kind = request.url.path.split("/")[-1]
routes = all_routes[kind]
models = args.models[kind]
model = json.loads(request_body).get("model")
# Resolve model aliases, w/ optional default.
if model not in models and "default" in models:
model = models["default"]
while model in models and isinstance(models[model], str):
model = models[model]
if model not in routes:
raise HTTPException(
status_code=400,
detail=f"Invalid model ({model}). Must be set to one of " + ", ".join(args.models.keys()))
client = clients[routes[model]]
url = httpx.URL(path=request.url.path, query=request.url.query.encode("utf-8"))
req = client.build_request(request.method, url, headers=request.headers.raw, content=request_body)
res = await client.send(req, stream=True)
return StreamingResponse(res.aiter_raw(), status_code=res.status_code, headers=res.headers, background=BackgroundTask(res.aclose))
for path in ["/v1/embeddings", "/v1/completions", "/v1/chat/completions"]:
app.add_api_route(path, forward_call, methods=["POST"])
uvicorn.run(app, host=args.host, port=args.port)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment