llama_cpp_server_model_swapping_proxy_middleware.py (set variables as needed; MIT license)
# Minimalist OpenAI-API-compatible llama.cpp server proxy / middleware with dynamic model switching.
# It manages model loading and automatic shutdown, providing seamless model hot-swapping and idle
# shutdown while exposing llama.cpp's advanced features such as speculative decoding.
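# Usage: adjust the paths and ports below for your machine, then run
#   python llama_cpp_server_model_swapping_proxy_middleware.py
# and point any OpenAI-compatible client at http://127.0.0.1:8000/v1
# (a client sketch is included at the bottom of this file). Requires aiohttp.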
import asyncio
import json
from datetime import datetime, timedelta
import subprocess
import aiohttp
from aiohttp import web
# Constants
PROXY_PORT = 8000  # Port clients connect to (the "external" port)
SERVER_PORT = 8312  # Port the llama.cpp server listens on (internal)
IDLE_TIMEOUT_SECONDS = 60 * 100 # 100 minutes
SERVER_READY_TIMEOUT_SECONDS = 90
DEFAULT_MODEL = "/fast/Meta-Llama-3.1-8B-Instruct-IQ4_XS.gguf"
DEFAULT_DRAFT_MODEL = "/fast/Llama-3.2-1B-Instruct-IQ4_XS.gguf"
CHUNK_SIZE = 4096
MAX_GPU_LAYERS = 99
DEFAULT_CTX_SIZE = 4096
DEFAULT_THREADS = 8
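# In the command template below, --model-draft enables speculative decoding with a small
# draft model, -fa turns on flash attention, and -ngl / --gpu-layers-draft set how many
# layers of the main and draft models are offloaded to the GPU.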
LLAMA_SERVER_CMD = """
/fast/llama_gpu/bin/llama-server
--model {model}
--model-draft {draft_model}
--ctx-size {ctx_size}
--threads {threads}
--port {port}
-fa
-ngl {gpu_layers}
--gpu-layers-draft {gpu_layers}
"""
KEY_MODEL = "model"
KEY_DRAFT_MODEL = "draft_model"
KEY_PROCESS = "process"
KEY_LAST_REQUEST = "last_request"
# Shared State
state = {
    KEY_PROCESS: None,
    KEY_LAST_REQUEST: None,
    KEY_MODEL: DEFAULT_MODEL,
    KEY_DRAFT_MODEL: DEFAULT_DRAFT_MODEL,
}


async def is_server_ready():
    """Check if the Llama server is ready."""
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(f"http://127.0.0.1:{SERVER_PORT}/health") as resp:
                return resp.status == 200
    except aiohttp.ClientConnectorError:
        return False


async def wait_for_server_ready():
    """Wait for the Llama server to be ready."""
    start_time = datetime.now()
    while not await is_server_ready():
        if datetime.now() - start_time > timedelta(
            seconds=SERVER_READY_TIMEOUT_SECONDS
        ):
            raise Exception("Server failed to become ready")
        await asyncio.sleep(1)


async def start_server(
    model=None, draft_model=None, ctx_size=DEFAULT_CTX_SIZE, threads=DEFAULT_THREADS
):
    """Start the Llama server."""
    model = model or state[KEY_MODEL]
    draft_model = draft_model or state[KEY_DRAFT_MODEL]
    if state[KEY_PROCESS] and (
        state[KEY_MODEL] != model or state[KEY_DRAFT_MODEL] != draft_model
    ):
        print("Stopping server to switch models...")
        await stop_server()
    if state[KEY_PROCESS] is None:
        cmd = LLAMA_SERVER_CMD.format(
            model=model,
            draft_model=draft_model,
            ctx_size=ctx_size,
            threads=threads,
            port=SERVER_PORT,
            gpu_layers=MAX_GPU_LAYERS,
        ).split()
        state[KEY_PROCESS] = await asyncio.create_subprocess_exec(
            *cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        asyncio.create_task(stream_subprocess_output(state[KEY_PROCESS]))
        state[KEY_MODEL] = model
        state[KEY_DRAFT_MODEL] = draft_model
        state[KEY_LAST_REQUEST] = datetime.now()
        print(
            f"Llama server started on port {SERVER_PORT} with models: {model}, {draft_model}"
        )
        await wait_for_server_ready()


async def stop_server():
    """Stop the Llama server."""
    if state[KEY_PROCESS]:
        print("Terminating Llama server process...")
        state[KEY_PROCESS].terminate()
        await state[KEY_PROCESS].wait()
        print("Llama server process terminated.")
        state[KEY_PROCESS] = None


async def stream_subprocess_output(process):
    """Stream subprocess output."""

    async def stream_pipe(pipe):
        while line := await pipe.readline():
            print(line.decode(), end="")

    await asyncio.gather(stream_pipe(process.stdout), stream_pipe(process.stderr))


async def monitor_idle_timeout():
    """Monitor server idle timeout."""
    while True:
        await asyncio.sleep(5)
        if state[KEY_LAST_REQUEST] and datetime.now() - state[
            KEY_LAST_REQUEST
        ] > timedelta(seconds=IDLE_TIMEOUT_SECONDS):
            print("Stopping server due to inactivity...")
            await stop_server()
            state[KEY_LAST_REQUEST] = None


def adapt_request_for_llama(headers, body):
    """Strip the proxy-only "model" and "draft_model" fields from the request body."""
    try:
        request_json = json.loads(body)
        model = request_json.pop("model", None)
        draft_model = request_json.pop("draft_model", None)
        body = json.dumps(request_json).encode()
        # dict(request.headers) keeps the client's original capitalisation, so drop any
        # existing content-type / content-length entries before overriding them.
        for key in list(headers):
            if key.lower() in ("content-type", "content-length"):
                del headers[key]
        headers["Content-Type"] = "application/json"
        headers["Content-Length"] = str(len(body))
        return headers, body, model, draft_model
    except json.JSONDecodeError:
        return headers, body, None, None


async def proxy_request(request):
    """Handle incoming requests using aiohttp.web and forward them to the Llama server."""
    try:
        # Read request body
        body = await request.read()
        headers = dict(request.headers)
        # Adapt request for Llama
        headers, body, model, draft_model = adapt_request_for_llama(headers, body)
        # Start the Llama server (if needed)
        await start_server(model, draft_model)
        state[KEY_LAST_REQUEST] = datetime.now()
        # Proxy to the Llama.cpp server (path_qs preserves any query string)
        async with aiohttp.ClientSession() as session:
            async with session.request(
                method=request.method,
                url=f"http://127.0.0.1:{SERVER_PORT}{request.path_qs}",
                headers=headers,
                data=body,
            ) as llama_response:
                # Stream the response from the Llama server back to the client
                response = web.StreamResponse(
                    status=llama_response.status,
                    headers=llama_response.headers,
                )
                await response.prepare(request)
                async for chunk in llama_response.content.iter_any():
                    await response.write(chunk)
                await response.write_eof()
                return response
    except Exception as e:
        print(f"An error occurred: {e}")
        return web.Response(status=500, text="Internal Server Error")


async def main():
    """Main function."""
    await start_server()
    asyncio.create_task(monitor_idle_timeout())
    app = web.Application()
    app.router.add_route("*", "/{path:.*}", proxy_request)
    runner = web.AppRunner(app)
    await runner.setup()
    site = web.TCPSite(runner, "127.0.0.1", PROXY_PORT)
    print(f"Serving on http://127.0.0.1:{PROXY_PORT}")
    await site.start()
    try:
        await asyncio.Future()  # run forever
    finally:
        await runner.cleanup()


asyncio.run(main())
"""
Why not just use the Llama server directly?
-That's fine until you want to switch models, or save resources when not in use.
-It's bleeding edge and supports the latest models with speculative decoding to speed up completions.
-It also can support JSON schema enforcement and other great features.
Ollama is a similar project but it is more complicated and lags behind llama.cpp.
Python-llama-cpp is a Python wrapper around llama.cpp but it is not as feature rich as llama.cpp.
This is a minimalist server that is easy to understand and modify.
The standard OpenAI API compatibility is implemented by Llama.cpp Server.
The OpenAI style API is even offered by Google's Gemini API.
"""
"""
This works but how could it be more minimalist while preserving the primary functionality?
httpx may help. FastAPI and Flask would not.
This is middleware.
FastAPI is a nice wrapper around Starlette, which is a nice wrapper around anyio IIRC.
config managment is not a big chunk of the code. Celery would be overkill.
We don't need logging libraries.
I want to keep everything in one file.
"""