Skip to content

Instantly share code, notes, and snippets.

@sasasin
Created June 2, 2026 16:24
Show Gist options
  • Select an option

  • Save sasasin/0ace5f3d1ff3ad3f672284e292231b22 to your computer and use it in GitHub Desktop.

Select an option

Save sasasin/0ace5f3d1ff3ad3f672284e292231b22 to your computer and use it in GitHub Desktop.
OpenCode 側のモードによって thinking ON/OFF を llama.cpp に送るサンプル
// OpenCode configuration: registers a local llama-server endpoint as an
// OpenAI-compatible provider and sets enable_thinking per agent.
{
"$schema": "https://opencode.ai/config.json",
// llama-server (local) provider — OpenAI-compatible endpoint
"provider": {
"llamacpp": {
"npm": "@ai-sdk/openai-compatible",
"name": "llama-server (local)",
"options": {
// Port 8081 matches the default --listen-port of the accompanying proxy
// script, which in turn forwards to its --upstream (default port 8080).
"baseURL": "http://127.0.0.1:8081/v1"
},
"models": {
"gemma-4-e4b": {
"name": "Gemma 4 E4B (Q4_K_S, local)",
// NOTE(review): presumably token counts — confirm against the OpenCode config schema.
"limit": {
"context": 131072,
"output": 8192
}
}
}
}
},
// Default model for all agents unless overridden below
"model": "llamacpp/gemma-4-e4b",
"agent": {
// plan: thinking ON
"plan": {
"model": "llamacpp/gemma-4-e4b",
"options": {
// The accompanying proxy lifts chat_template_kwargs to the top level of
// the request body, which is where llama-server expects it.
"chat_template_kwargs": {
"enable_thinking": true
}
}
},
// build: thinking OFF
"build": {
"model": "llamacpp/gemma-4-e4b",
"options": {
"chat_template_kwargs": {
"enable_thinking": false
}
}
}
}
}
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# dependencies = [
# "fastapi",
# "httpx",
# "uvicorn",
# ]
# ///
"""
Proxy between OpenCode and llama-server.
Rewrites requests so that OpenCode's `body.body.chat_template_kwargs`
is lifted to the top-level `chat_template_kwargs` that llama-server expects.
Logs original and rewritten requests, plus upstream responses, to a log file (each entry is also echoed to stdout).
Usage:
uv run proxy.py [--listen-port 8081] [--upstream http://127.0.0.1:8080] [--log logs/proxy.log]
uv run proxy.py --passthrough # disable rewriting, forward requests as-is
"""
import argparse
import json
import sys
from datetime import datetime
from pathlib import Path
import httpx
import uvicorn
from fastapi import FastAPI, Request, Response
from fastapi.responses import StreamingResponse
# ---------------------------------------------------------------------------
# CLI args
# ---------------------------------------------------------------------------
# Module-level setup: parse the command line, derive the constants the request
# handler reads, make sure the log directory exists, and create the ASGI app.
parser = argparse.ArgumentParser(description="Transparent logging proxy for llama-server")
parser.add_argument("--listen-port", type=int, default=8081)
parser.add_argument("--upstream", default="http://127.0.0.1:8080")
parser.add_argument("--log", default="logs/proxy.log")
parser.add_argument("--passthrough", action="store_true", help="Disable request rewriting; forward as-is")
# parse_known_args() returns unrecognised arguments separately instead of
# exiting with an error; they are discarded here.
args, _ = parser.parse_known_args()
# Trailing slashes are stripped because the handler builds "<UPSTREAM>/<path>".
UPSTREAM = args.upstream.rstrip("/")
PASSTHROUGH = args.passthrough
LOG_PATH = Path(args.log)
# Create the log file's parent directory up front so log() can open in append mode.
LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
app = FastAPI()
# ---------------------------------------------------------------------------
# Logging helper
# ---------------------------------------------------------------------------
def log(entry: dict) -> None:
    """Record *entry* as a single JSON line.

    The serialized entry is appended to the file at ``LOG_PATH`` and then
    echoed to stdout (flushed immediately). Non-ASCII characters are written
    as-is rather than escaped.
    """
    serialized = json.dumps(entry, ensure_ascii=False)
    with LOG_PATH.open("a", encoding="utf-8") as sink:
        print(serialized, file=sink)
    print(serialized, flush=True)
# ---------------------------------------------------------------------------
# Proxy handler
# ---------------------------------------------------------------------------
# Request headers that httpx must regenerate for the upstream hop: the body may
# have been rewritten, so the client's Content-Length can be stale.
_REQUEST_HEADERS_TO_DROP = frozenset({"host", "content-length", "transfer-encoding"})

# Upstream response headers that no longer describe what this proxy sends back.
# httpx exposes an already-decoded body (`.content` / `aiter_bytes()`), and the
# ASGI server chooses its own framing, so forwarding these verbatim would make
# the reply inconsistent with its headers.
_RESPONSE_HEADERS_TO_DROP = frozenset(
    {"content-length", "content-encoding", "transfer-encoding", "connection", "keep-alive"}
)


def _now() -> str:
    """Return the current local time as an ISO-8601 string with millisecond precision."""
    return datetime.now().isoformat(timespec="milliseconds")


def _lift_nested_body(payload: dict) -> bool:
    """Merge ``payload["body"]`` into *payload* in place; return True if anything changed.

    ``chat_template_kwargs`` found in the nested body is merged key-by-key with
    a top-level ``chat_template_kwargs`` when both are dicts (nested values win);
    otherwise the nested value simply replaces the top-level one. All remaining
    nested keys are copied to the top level, overriding existing keys.
    """
    nested = payload.get("body")
    if not isinstance(nested, dict):
        return False
    del payload["body"]
    if "chat_template_kwargs" in nested:
        lifted = nested.pop("chat_template_kwargs")
        current = payload.get("chat_template_kwargs")
        if isinstance(lifted, dict) and isinstance(current, dict):
            lifted = {**current, **lifted}
        payload["chat_template_kwargs"] = lifted
    # merge any remaining keys from nested body
    payload.update(nested)
    return True


def _forwardable_response_headers(upstream_headers) -> dict:
    """Return upstream response headers minus those in ``_RESPONSE_HEADERS_TO_DROP``."""
    return {
        k: v for k, v in upstream_headers.items()
        if k.lower() not in _RESPONSE_HEADERS_TO_DROP
    }


def _bad_gateway(exc: Exception) -> Response:
    """Log an upstream transport failure and build a 502 JSON reply for the client."""
    log({
        "ts": _now(),
        "direction": "upstream_error",
        "error": repr(exc),
    })
    payload = {"error": {"message": f"upstream request failed: {exc!r}", "type": "proxy_upstream_error"}}
    return Response(
        content=json.dumps(payload, ensure_ascii=False),
        status_code=502,
        media_type="application/json",
    )


@app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "PATCH", "OPTIONS", "HEAD"])
async def proxy(request: Request, path: str):
    """Forward any request to ``UPSTREAM`` and log both directions.

    Steps:
      1. Read and log the incoming request (headers and body).
      2. Unless ``PASSTHROUGH`` is set, merge a nested ``"body"`` object into
         the top level of a JSON request body (lifting ``chat_template_kwargs``)
         and log the rewritten body.
      3. Send the request upstream. If the JSON body has a truthy ``"stream"``
         key the upstream reply is relayed chunk by chunk; otherwise it is read
         fully and returned in one piece.

    The upstream status code and headers (minus body-framing headers) are
    relayed in both modes. A transport failure talking to upstream is logged
    and answered with HTTP 502.
    """
    url = f"{UPSTREAM}/{path}"
    if request.url.query:
        url += f"?{request.url.query}"

    body_bytes = await request.body()

    # Parse request body for logging; fall back to text when it is not JSON.
    try:
        body_json = json.loads(body_bytes) if body_bytes else None
    except (ValueError, RecursionError):
        body_json = body_bytes.decode("utf-8", errors="replace")

    ts = _now()
    log({
        "ts": ts,
        "direction": "request_original",
        "method": request.method,
        "path": f"/{path}",
        "headers": dict(request.headers),
        "body": body_json,
    })

    # Rewrite: lift body.body.chat_template_kwargs → top-level chat_template_kwargs
    # OpenCode sends options.body as a nested "body" key; llama-server needs it at top level.
    if not PASSTHROUGH and isinstance(body_json, dict) and _lift_nested_body(body_json):
        body_bytes = json.dumps(body_json, ensure_ascii=False).encode("utf-8")
        # Logged whenever the forwarded body differs from the original, so the
        # log always shows what upstream actually received.
        log({
            "ts": ts,
            "direction": "request_rewritten",
            "method": request.method,
            "path": f"/{path}",
            "body": body_json,
        })

    headers = {
        k: v for k, v in request.headers.items()
        if k.lower() not in _REQUEST_HEADERS_TO_DROP
    }

    is_stream = isinstance(body_json, dict) and bool(body_json.get("stream", False))

    if is_stream:
        # Must NOT use `async with` around the client here — the generator
        # runs after this coroutine returns, so the client must outlive it.
        client = httpx.AsyncClient(timeout=300.0)
        try:
            upstream_req = client.build_request(
                request.method,
                url,
                headers=headers,
                content=body_bytes,
            )
            # Open the upstream response *before* replying so its status code
            # and headers can be relayed instead of a hard-coded 200.
            upstream_resp = await client.send(upstream_req, stream=True)
        except BaseException as exc:
            # The generator will never run, so release the client here.
            await client.aclose()
            if isinstance(exc, httpx.HTTPError):
                return _bad_gateway(exc)
            raise

        log({
            "ts": _now(),
            "direction": "response_start",
            "status": upstream_resp.status_code,
            "headers": dict(upstream_resp.headers),
        })

        async def stream_generator():
            chunks = []
            try:
                async for chunk in upstream_resp.aiter_bytes():
                    chunks.append(chunk)
                    yield chunk
            finally:
                await upstream_resp.aclose()
                await client.aclose()
                combined = b"".join(chunks).decode("utf-8", errors="replace")
                log({
                    "ts": _now(),
                    "direction": "response_body_stream",
                    "body_raw": combined,
                })

        stream_headers = _forwardable_response_headers(upstream_resp.headers)
        has_content_type = any(k.lower() == "content-type" for k in stream_headers)
        return StreamingResponse(
            stream_generator(),
            status_code=upstream_resp.status_code,
            headers=stream_headers,
            # Fall back to SSE only when upstream did not declare a content type.
            media_type=None if has_content_type else "text/event-stream",
        )

    try:
        async with httpx.AsyncClient(timeout=300.0) as client:
            upstream_resp = await client.request(
                request.method,
                url,
                headers=headers,
                content=body_bytes,
            )
    except httpx.HTTPError as exc:
        return _bad_gateway(exc)

    # Parse the response body for logging only; the raw bytes are what is returned.
    try:
        resp_json = upstream_resp.json()
    except ValueError:
        resp_json = upstream_resp.text

    log({
        "ts": _now(),
        "direction": "response",
        "status": upstream_resp.status_code,
        "headers": dict(upstream_resp.headers),
        "body": resp_json,
    })

    return Response(
        content=upstream_resp.content,
        status_code=upstream_resp.status_code,
        headers=_forwardable_response_headers(upstream_resp.headers),
    )
if __name__ == "__main__":
    # Print a short startup banner, then serve on the loopback interface only.
    if PASSTHROUGH:
        mode = "passthrough (no rewriting)"
    else:
        mode = "rewrite (chat_template_kwargs lifting)"
    banner = (
        f"Proxy listening on http://127.0.0.1:{args.listen_port}",
        f"Forwarding to: {UPSTREAM}",
        f"Log file: {LOG_PATH.resolve()}",
        f"Mode: {mode}",
    )
    for banner_line in banner:
        print(banner_line, flush=True)
    uvicorn.run(app, host="127.0.0.1", port=args.listen_port, log_level="warning")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment