sasasin · June 2, 2026 16:24
diff --git a/opencode.jsonc b/opencode.jsonc
 {
  // OpenCode configuration: registers a local llama-server as an
  // OpenAI-compatible provider and sets per-agent thinking options.
  "$schema": "https://opencode.ai/config.json",

  // llama-server (local) provider — OpenAI-compatible endpoint
  "provider": {
    "llamacpp": {
      "npm": "@ai-sdk/openai-compatible",
      "name": "llama-server (local)",
      "options": {
        // Port 8081 is proxy.py's default --listen-port; the proxy forwards
        // to llama-server (default --upstream http://127.0.0.1:8080).
        "baseURL": "http://127.0.0.1:8081/v1"
      },
      "models": {
        "gemma-4-e4b": {
          "name": "Gemma 4 E4B (Q4_K_S, local)",
          // Limits advertised to OpenCode (presumably token counts — confirm
          // against the OpenCode config schema).
          "limit": {
            "context": 131072,
            "output": 8192
          }
        }
      }
    }
  },

  // Default model for all agents unless overridden below
  "model": "llamacpp/gemma-4-e4b",

  // Per-agent overrides. proxy.py lifts chat_template_kwargs from the nested
  // request "body" to the top level of the request sent to llama-server.
  "agent": {
    // plan agent: thinking enabled
    "plan": {
      "model": "llamacpp/gemma-4-e4b",
      "options": {
        "chat_template_kwargs": {
          "enable_thinking": true
        }
      }
    },

    // build agent: thinking disabled
    "build": {
      "model": "llamacpp/gemma-4-e4b",
      "options": {
        "chat_template_kwargs": {
          "enable_thinking": false
        }
      }
    }
  }
 }
diff --git a/proxy.py b/proxy.py
 #!/usr/bin/env python3
 # /// script
 # requires-python = ">=3.11"
 # dependencies = [
 #   "fastapi",
 #   "httpx",
 #   "uvicorn",
 # ]
 # ///
 """
 Proxy between OpenCode and llama-server.
 Rewrites requests so that OpenCode's `body.body.chat_template_kwargs`
 is lifted to the top-level `chat_template_kwargs` that llama-server expects.
 Logs original and rewritten requests, plus upstream responses, to a log file and to stdout.

 Usage:
  uv run proxy.py [--listen-port 8081] [--upstream http://127.0.0.1:8080] [--log logs/proxy.log]
  uv run proxy.py --passthrough   # disable rewriting, forward requests as-is
 """

 import argparse
 import json
 import sys
 from datetime import datetime
 from pathlib import Path

 import httpx
 import uvicorn
 from fastapi import FastAPI, Request, Response
 from fastapi.responses import StreamingResponse

 # ---------------------------------------------------------------------------
 # CLI args
 # ---------------------------------------------------------------------------
 # Command-line interface. parse_known_args() is used so that flags this
 # script does not define are ignored rather than aborting start-up.
 parser = argparse.ArgumentParser(
    description="Transparent logging proxy for llama-server",
 )
 parser.add_argument("--listen-port", default=8081, type=int)
 parser.add_argument("--upstream", default="http://127.0.0.1:8080")
 parser.add_argument("--log", default="logs/proxy.log")
 parser.add_argument(
    "--passthrough",
    help="Disable request rewriting; forward as-is",
    action="store_true",
 )
 args, _ = parser.parse_known_args()

 # Settings derived from the parsed arguments.
 UPSTREAM = args.upstream.rstrip("/")  # no trailing slash: paths are appended as "/<path>"
 PASSTHROUGH = args.passthrough
 LOG_PATH = Path(args.log)
 # Make sure the log directory exists before the first write.
 LOG_PATH.parent.mkdir(exist_ok=True, parents=True)

 app = FastAPI()

 # ---------------------------------------------------------------------------
 # Logging helper
 # ---------------------------------------------------------------------------
 def log(entry: dict) -> None:
    """Record *entry* as one JSON line in the log file and on stdout.

    The entry is serialized once (non-ASCII characters are kept as-is),
    appended to ``LOG_PATH`` as UTF-8, then echoed to stdout with an
    immediate flush.
    """
    serialized = json.dumps(entry, ensure_ascii=False)
    with open(LOG_PATH, mode="a", encoding="utf-8") as sink:
        sink.write(f"{serialized}\n")
    print(serialized, flush=True)


 # ---------------------------------------------------------------------------
 # Proxy handler
 # ---------------------------------------------------------------------------
 # Request headers that must be regenerated for the upstream call: the body may
 # have been re-encoded (new length), and Accept-Encoding has to reflect what
 # httpx itself can decode, not what the original client can.
 _DROPPED_REQUEST_HEADERS = frozenset(
    {"host", "content-length", "transfer-encoding", "accept-encoding"}
 )

 # Response headers that describe the upstream wire format rather than the
 # payload relayed to the client: httpx returns the already-decoded body and
 # the ASGI server applies its own framing.
 _DROPPED_RESPONSE_HEADERS = frozenset(
    {"content-encoding", "content-length", "transfer-encoding", "connection", "keep-alive"}
 )


 def _lift_nested_body(payload: dict) -> bool:
    """Flatten a nested ``body`` object into *payload*, in place.

    A dict-valued ``chat_template_kwargs`` inside the nested object is merged
    into the top-level ``chat_template_kwargs`` (nested values win). Every
    other nested key is copied to the top level, overriding existing keys.

    Returns:
        True when *payload* was modified, False when it has no nested
        ``body`` dict.
    """
    nested = payload.get("body")
    if not isinstance(nested, dict):
        return False
    del payload["body"]

    lifted = nested.get("chat_template_kwargs")
    if isinstance(lifted, dict):
        del nested["chat_template_kwargs"]
        current = payload.get("chat_template_kwargs")
        # Tolerate a missing or non-object top-level value instead of crashing.
        merged = dict(current) if isinstance(current, dict) else {}
        merged.update(lifted)
        payload["chat_template_kwargs"] = merged

    # Merge any remaining keys from the nested body.
    payload.update(nested)
    return True


 def _client_safe_headers(upstream_headers) -> dict:
    """Return the upstream response headers minus those invalidated by proxying."""
    return {
        name: value
        for name, value in upstream_headers.items()
        if name.lower() not in _DROPPED_RESPONSE_HEADERS
    }


 def _bad_gateway(exc: Exception) -> Response:
    """Log an upstream transport failure and build a 502 reply for the client."""
    detail = f"{type(exc).__name__}: {exc}"
    log({
        "ts": datetime.now().isoformat(timespec="milliseconds"),
        "direction": "upstream_error",
        "error": detail,
    })
    return Response(
        content=json.dumps({"error": {"message": detail, "type": "upstream_error"}}),
        status_code=502,
        media_type="application/json",
    )


 @app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "PATCH", "OPTIONS", "HEAD"])
 async def proxy(request: Request, path: str):
    """Forward one request to the upstream server and relay its response.

    Unless PASSTHROUGH is set, a JSON body containing a nested ``body``
    object is flattened first (see ``_lift_nested_body``). The original
    request, the rewritten request (if any) and the response are logged.

    Args:
        request: Incoming request.
        path: Request path captured by the catch-all route (no leading slash).

    Returns:
        A ``StreamingResponse`` when the JSON body has a truthy ``stream``
        key, otherwise a buffered ``Response``. Both carry the upstream
        status code. A 502 JSON response is returned when the upstream
        cannot be reached.
    """
    url = f"{UPSTREAM}/{path}"
    if request.url.query:
        url += f"?{request.url.query}"

    body_bytes = await request.body()

    # Parse the body for logging and rewriting. A body that is not valid JSON
    # is logged as text and forwarded untouched.
    body_json = None
    if body_bytes:
        try:
            body_json = json.loads(body_bytes)
        except (ValueError, RecursionError):
            body_json = body_bytes.decode("utf-8", errors="replace")

    ts = datetime.now().isoformat(timespec="milliseconds")
    log({
        "ts": ts,
        "direction": "request_original",
        "method": request.method,
        "path": f"/{path}",
        "headers": dict(request.headers),
        "body": body_json,
    })

    # Rewrite: lift body.body.chat_template_kwargs → top-level chat_template_kwargs.
    # OpenCode sends options.body as a nested "body" key; llama-server needs it at top level.
    rewritten = False
    if not PASSTHROUGH and isinstance(body_json, dict):
        rewritten = _lift_nested_body(body_json)

    if rewritten:
        # Every modification is re-encoded AND logged, so the log always
        # shows exactly what was sent upstream.
        body_bytes = json.dumps(body_json, ensure_ascii=False).encode("utf-8")
        log({
            "ts": ts,
            "direction": "request_rewritten",
            "method": request.method,
            "path": f"/{path}",
            "body": body_json,
        })

    headers = {
        k: v for k, v in request.headers.items()
        if k.lower() not in _DROPPED_REQUEST_HEADERS
    }

    is_stream = isinstance(body_json, dict) and body_json.get("stream", False)

    if is_stream:
        # Keep the client alive for the full duration of the stream.
        # Must NOT use `async with` around the client here — the generator
        # runs after this coroutine returns, so the client must outlive it.
        client = httpx.AsyncClient(timeout=300.0)
        try:
            # Open the upstream response BEFORE building our own, so its
            # status code and content type can be relayed to the client.
            upstream_req = client.build_request(
                request.method,
                url,
                headers=headers,
                content=body_bytes,
            )
            upstream_resp = await client.send(upstream_req, stream=True)
        except httpx.HTTPError as exc:
            await client.aclose()
            return _bad_gateway(exc)

        log({
            "ts": datetime.now().isoformat(timespec="milliseconds"),
            "direction": "response_start",
            "status": upstream_resp.status_code,
            "headers": dict(upstream_resp.headers),
        })

        async def stream_generator():
            chunks = []
            try:
                async for chunk in upstream_resp.aiter_bytes():
                    chunks.append(chunk)
                    yield chunk
            finally:
                await upstream_resp.aclose()
                await client.aclose()
                # Log whatever was relayed, even if the stream was cut short.
                combined = b"".join(chunks).decode("utf-8", errors="replace")
                log({
                    "ts": datetime.now().isoformat(timespec="milliseconds"),
                    "direction": "response_body_stream",
                    "body_raw": combined,
                })

        return StreamingResponse(
            stream_generator(),
            status_code=upstream_resp.status_code,
            media_type=upstream_resp.headers.get("content-type", "text/event-stream"),
        )

    try:
        async with httpx.AsyncClient(timeout=300.0) as client:
            upstream_resp = await client.request(
                request.method,
                url,
                headers=headers,
                content=body_bytes,
            )
    except httpx.HTTPError as exc:
        return _bad_gateway(exc)

    try:
        resp_json = upstream_resp.json()
    except ValueError:
        resp_json = upstream_resp.text

    log({
        "ts": datetime.now().isoformat(timespec="milliseconds"),
        "direction": "response",
        "status": upstream_resp.status_code,
        "headers": dict(upstream_resp.headers),
        "body": resp_json,
    })

    # `content` is already decoded by httpx, so encoding/length/framing
    # headers from upstream must not be forwarded with it.
    return Response(
        content=upstream_resp.content,
        status_code=upstream_resp.status_code,
        headers=_client_safe_headers(upstream_resp.headers),
    )


 if __name__ == "__main__":
    # Start-up banner: one line per setting, each flushed immediately.
    mode = "passthrough (no rewriting)" if PASSTHROUGH else "rewrite (chat_template_kwargs lifting)"
    banner = (
        f"Proxy listening on http://127.0.0.1:{args.listen_port}",
        f"Forwarding to: {UPSTREAM}",
        f"Log file: {LOG_PATH.resolve()}",
        f"Mode: {mode}",
    )
    for banner_line in banner:
        print(banner_line, flush=True)
    # Serve on loopback only.
    uvicorn.run(app, host="127.0.0.1", port=args.listen_port, log_level="warning")
	{
	// OpenCode configuration: registers a local llama-server as an
	// OpenAI-compatible provider and sets per-agent thinking options.
	"$schema": "https://opencode.ai/config.json",

	// llama-server (local) provider — OpenAI-compatible endpoint
	"provider": {
	"llamacpp": {
	"npm": "@ai-sdk/openai-compatible",
	"name": "llama-server (local)",
	"options": {
	// Port 8081 is proxy.py's default --listen-port; the proxy forwards
	// to llama-server (default --upstream http://127.0.0.1:8080).
	"baseURL": "http://127.0.0.1:8081/v1"
	},
	"models": {
	"gemma-4-e4b": {
	"name": "Gemma 4 E4B (Q4_K_S, local)",
	// Limits advertised to OpenCode (presumably token counts — confirm
	// against the OpenCode config schema).
	"limit": {
	"context": 131072,
	"output": 8192
	}
	}
	}
	}
	},

	// Default model for all agents unless overridden below
	"model": "llamacpp/gemma-4-e4b",

	// Per-agent overrides. proxy.py lifts chat_template_kwargs from the nested
	// request "body" to the top level of the request sent to llama-server.
	"agent": {
	// plan agent: thinking enabled
	"plan": {
	"model": "llamacpp/gemma-4-e4b",
	"options": {
	"chat_template_kwargs": {
	"enable_thinking": true
	}
	}
	},

	// build agent: thinking disabled
	"build": {
	"model": "llamacpp/gemma-4-e4b",
	"options": {
	"chat_template_kwargs": {
	"enable_thinking": false
	}
	}
	}
	}
	}
	#!/usr/bin/env python3
	# /// script
	# requires-python = ">=3.11"
	# dependencies = [
	# "fastapi",
	# "httpx",
	# "uvicorn",
	# ]
	# ///
	"""
	Proxy between OpenCode and llama-server.
	Rewrites requests so that OpenCode's `body.body.chat_template_kwargs`
	is lifted to the top-level `chat_template_kwargs` that llama-server expects.
	Logs original and rewritten requests, plus upstream responses, to a log file and to stdout.

	Usage:
	uv run proxy.py [--listen-port 8081] [--upstream http://127.0.0.1:8080] [--log logs/proxy.log]
	uv run proxy.py --passthrough # disable rewriting, forward requests as-is
	"""

	import argparse
	import json
	import sys
	from datetime import datetime
	from pathlib import Path

	import httpx
	import uvicorn
	from fastapi import FastAPI, Request, Response
	from fastapi.responses import StreamingResponse

	# ---------------------------------------------------------------------------
	# CLI args
	# ---------------------------------------------------------------------------
	# Command-line interface. parse_known_args() is used so that flags this
	# script does not define are ignored rather than aborting start-up.
	parser = argparse.ArgumentParser(
	    description="Transparent logging proxy for llama-server",
	)
	parser.add_argument("--listen-port", default=8081, type=int)
	parser.add_argument("--upstream", default="http://127.0.0.1:8080")
	parser.add_argument("--log", default="logs/proxy.log")
	parser.add_argument(
	    "--passthrough",
	    help="Disable request rewriting; forward as-is",
	    action="store_true",
	)
	args, _ = parser.parse_known_args()

	# Settings derived from the parsed arguments.
	UPSTREAM = args.upstream.rstrip("/")  # no trailing slash: paths are appended as "/<path>"
	PASSTHROUGH = args.passthrough
	LOG_PATH = Path(args.log)
	# Make sure the log directory exists before the first write.
	LOG_PATH.parent.mkdir(exist_ok=True, parents=True)

	app = FastAPI()

	# ---------------------------------------------------------------------------
	# Logging helper
	# ---------------------------------------------------------------------------
	def log(entry: dict) -> None:
	    """Record *entry* as one JSON line in the log file and on stdout.

	    The entry is serialized once (non-ASCII characters are kept as-is),
	    appended to ``LOG_PATH`` as UTF-8, then echoed to stdout with an
	    immediate flush.
	    """
	    line = json.dumps(entry, ensure_ascii=False)
	    with LOG_PATH.open("a", encoding="utf-8") as f:
	        f.write(line + "\n")
	    print(line, flush=True)


	# ---------------------------------------------------------------------------
	# Proxy handler
	# ---------------------------------------------------------------------------
	# Request headers that must be regenerated for the upstream call: the body may
	# have been re-encoded (new length), and Accept-Encoding has to reflect what
	# httpx itself can decode, not what the original client can.
	_DROPPED_REQUEST_HEADERS = frozenset(
	    {"host", "content-length", "transfer-encoding", "accept-encoding"}
	)

	# Response headers that describe the upstream wire format rather than the
	# payload relayed to the client: httpx returns the already-decoded body and
	# the ASGI server applies its own framing.
	_DROPPED_RESPONSE_HEADERS = frozenset(
	    {"content-encoding", "content-length", "transfer-encoding", "connection", "keep-alive"}
	)


	def _lift_nested_body(payload: dict) -> bool:
	    """Flatten a nested ``body`` object into *payload*, in place.

	    A dict-valued ``chat_template_kwargs`` inside the nested object is merged
	    into the top-level ``chat_template_kwargs`` (nested values win). Every
	    other nested key is copied to the top level, overriding existing keys.

	    Returns:
	        True when *payload* was modified, False when it has no nested
	        ``body`` dict.
	    """
	    nested = payload.get("body")
	    if not isinstance(nested, dict):
	        return False
	    del payload["body"]

	    lifted = nested.get("chat_template_kwargs")
	    if isinstance(lifted, dict):
	        del nested["chat_template_kwargs"]
	        current = payload.get("chat_template_kwargs")
	        # Tolerate a missing or non-object top-level value instead of crashing.
	        merged = dict(current) if isinstance(current, dict) else {}
	        merged.update(lifted)
	        payload["chat_template_kwargs"] = merged

	    # Merge any remaining keys from the nested body.
	    payload.update(nested)
	    return True


	def _client_safe_headers(upstream_headers) -> dict:
	    """Return the upstream response headers minus those invalidated by proxying."""
	    return {
	        name: value
	        for name, value in upstream_headers.items()
	        if name.lower() not in _DROPPED_RESPONSE_HEADERS
	    }


	def _bad_gateway(exc: Exception) -> Response:
	    """Log an upstream transport failure and build a 502 reply for the client."""
	    detail = f"{type(exc).__name__}: {exc}"
	    log({
	        "ts": datetime.now().isoformat(timespec="milliseconds"),
	        "direction": "upstream_error",
	        "error": detail,
	    })
	    return Response(
	        content=json.dumps({"error": {"message": detail, "type": "upstream_error"}}),
	        status_code=502,
	        media_type="application/json",
	    )


	@app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "PATCH", "OPTIONS", "HEAD"])
	async def proxy(request: Request, path: str):
	    """Forward one request to the upstream server and relay its response.

	    Unless PASSTHROUGH is set, a JSON body containing a nested ``body``
	    object is flattened first (see ``_lift_nested_body``). The original
	    request, the rewritten request (if any) and the response are logged.

	    Args:
	        request: Incoming request.
	        path: Request path captured by the catch-all route (no leading slash).

	    Returns:
	        A ``StreamingResponse`` when the JSON body has a truthy ``stream``
	        key, otherwise a buffered ``Response``. Both carry the upstream
	        status code. A 502 JSON response is returned when the upstream
	        cannot be reached.
	    """
	    url = f"{UPSTREAM}/{path}"
	    if request.url.query:
	        url += f"?{request.url.query}"

	    body_bytes = await request.body()

	    # Parse the body for logging and rewriting. A body that is not valid JSON
	    # is logged as text and forwarded untouched.
	    body_json = None
	    if body_bytes:
	        try:
	            body_json = json.loads(body_bytes)
	        except (ValueError, RecursionError):
	            body_json = body_bytes.decode("utf-8", errors="replace")

	    ts = datetime.now().isoformat(timespec="milliseconds")
	    log({
	        "ts": ts,
	        "direction": "request_original",
	        "method": request.method,
	        "path": f"/{path}",
	        "headers": dict(request.headers),
	        "body": body_json,
	    })

	    # Rewrite: lift body.body.chat_template_kwargs → top-level chat_template_kwargs.
	    # OpenCode sends options.body as a nested "body" key; llama-server needs it at top level.
	    rewritten = False
	    if not PASSTHROUGH and isinstance(body_json, dict):
	        rewritten = _lift_nested_body(body_json)

	    if rewritten:
	        # Every modification is re-encoded AND logged, so the log always
	        # shows exactly what was sent upstream.
	        body_bytes = json.dumps(body_json, ensure_ascii=False).encode("utf-8")
	        log({
	            "ts": ts,
	            "direction": "request_rewritten",
	            "method": request.method,
	            "path": f"/{path}",
	            "body": body_json,
	        })

	    headers = {
	        k: v for k, v in request.headers.items()
	        if k.lower() not in _DROPPED_REQUEST_HEADERS
	    }

	    is_stream = isinstance(body_json, dict) and body_json.get("stream", False)

	    if is_stream:
	        # Keep the client alive for the full duration of the stream.
	        # Must NOT use `async with` around the client here — the generator
	        # runs after this coroutine returns, so the client must outlive it.
	        client = httpx.AsyncClient(timeout=300.0)
	        try:
	            # Open the upstream response BEFORE building our own, so its
	            # status code and content type can be relayed to the client.
	            upstream_req = client.build_request(
	                request.method,
	                url,
	                headers=headers,
	                content=body_bytes,
	            )
	            upstream_resp = await client.send(upstream_req, stream=True)
	        except httpx.HTTPError as exc:
	            await client.aclose()
	            return _bad_gateway(exc)

	        log({
	            "ts": datetime.now().isoformat(timespec="milliseconds"),
	            "direction": "response_start",
	            "status": upstream_resp.status_code,
	            "headers": dict(upstream_resp.headers),
	        })

	        async def stream_generator():
	            chunks = []
	            try:
	                async for chunk in upstream_resp.aiter_bytes():
	                    chunks.append(chunk)
	                    yield chunk
	            finally:
	                await upstream_resp.aclose()
	                await client.aclose()
	                # Log whatever was relayed, even if the stream was cut short.
	                combined = b"".join(chunks).decode("utf-8", errors="replace")
	                log({
	                    "ts": datetime.now().isoformat(timespec="milliseconds"),
	                    "direction": "response_body_stream",
	                    "body_raw": combined,
	                })

	        return StreamingResponse(
	            stream_generator(),
	            status_code=upstream_resp.status_code,
	            media_type=upstream_resp.headers.get("content-type", "text/event-stream"),
	        )

	    try:
	        async with httpx.AsyncClient(timeout=300.0) as client:
	            upstream_resp = await client.request(
	                request.method,
	                url,
	                headers=headers,
	                content=body_bytes,
	            )
	    except httpx.HTTPError as exc:
	        return _bad_gateway(exc)

	    try:
	        resp_json = upstream_resp.json()
	    except ValueError:
	        resp_json = upstream_resp.text

	    log({
	        "ts": datetime.now().isoformat(timespec="milliseconds"),
	        "direction": "response",
	        "status": upstream_resp.status_code,
	        "headers": dict(upstream_resp.headers),
	        "body": resp_json,
	    })

	    # `content` is already decoded by httpx, so encoding/length/framing
	    # headers from upstream must not be forwarded with it.
	    return Response(
	        content=upstream_resp.content,
	        status_code=upstream_resp.status_code,
	        headers=_client_safe_headers(upstream_resp.headers),
	    )


	if __name__ == "__main__":
	    # Start-up banner, flushed immediately so it is visible before serving.
	    print(f"Proxy listening on http://127.0.0.1:{args.listen_port}", flush=True)
	    print(f"Forwarding to: {UPSTREAM}", flush=True)
	    print(f"Log file: {LOG_PATH.resolve()}", flush=True)
	    print(f"Mode: {'passthrough (no rewriting)' if PASSTHROUGH else 'rewrite (chat_template_kwargs lifting)'}", flush=True)
	    # Serve on loopback only.
	    uvicorn.run(app, host="127.0.0.1", port=args.listen_port, log_level="warning")