Created
June 2, 2026 16:24
-
-
Save sasasin/0ace5f3d1ff3ad3f672284e292231b22 to your computer and use it in GitHub Desktop.
OpenCode 側のモードによって thinking ON/OFF を llama.cpp に送るサンプル
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Show hidden characters
| { | |
| "$schema": "https://opencode.ai/config.json", | |
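| // llama-server (local) provider — OpenAI-compatible endpoint. Note: baseURL points at the rewriting proxy (default listen port 8081), which forwards to llama-server (default upstream port 8080). | |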
| "provider": { | |
| "llamacpp": { | |
| "npm": "@ai-sdk/openai-compatible", | |
| "name": "llama-server (local)", | |
| "options": { | |
| "baseURL": "http://127.0.0.1:8081/v1" | |
| }, | |
| "models": { | |
| "gemma-4-e4b": { | |
| "name": "Gemma 4 E4B (Q4_K_S, local)", | |
| "limit": { | |
| "context": 131072, | |
| "output": 8192 | |
| } | |
| } | |
| } | |
| } | |
| }, | |
| // Default model for all agents unless overridden below | |
| "model": "llamacpp/gemma-4-e4b", | |
| "agent": { | |
| // plan: thinking ON | |
| "plan": { | |
| "model": "llamacpp/gemma-4-e4b", | |
| "options": { | |
| "chat_template_kwargs": { | |
| "enable_thinking": true | |
| } | |
| } | |
| }, | |
| // build: thinking OFF | |
| "build": { | |
| "model": "llamacpp/gemma-4-e4b", | |
| "options": { | |
| "chat_template_kwargs": { | |
| "enable_thinking": false | |
| } | |
| } | |
| } | |
| } | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| # /// script | |
| # requires-python = ">=3.11" | |
| # dependencies = [ | |
| # "fastapi", | |
| # "httpx", | |
| # "uvicorn", | |
| # ] | |
| # /// | |
| """ | |
| Proxy between OpenCode and llama-server. | |
| Rewrites requests so that OpenCode's `body.body.chat_template_kwargs` | |
| is lifted to the top-level `chat_template_kwargs` that llama-server expects. | |
| Logs original and rewritten requests to a log file. | |
| Usage: | |
| uv run proxy.py [--listen-port 8081] [--upstream http://127.0.0.1:8080] [--log logs/proxy.log] | |
| uv run proxy.py --passthrough # disable rewriting, forward requests as-is | |
| """ | |
| import argparse | |
| import json | |
| import sys | |
| from datetime import datetime | |
| from pathlib import Path | |
| import httpx | |
| import uvicorn | |
| from fastapi import FastAPI, Request, Response | |
| from fastapi.responses import StreamingResponse | |
| # --------------------------------------------------------------------------- | |
| # CLI args | |
| # --------------------------------------------------------------------------- | |
# Command-line interface. Every option has a default, so the proxy can be
# launched with no arguments at all.
parser = argparse.ArgumentParser(
    description="Transparent logging proxy for llama-server",
)
parser.add_argument("--listen-port", default=8081, type=int)
parser.add_argument("--upstream", default="http://127.0.0.1:8080")
parser.add_argument("--log", default="logs/proxy.log")
parser.add_argument(
    "--passthrough",
    action="store_true",
    help="Disable request rewriting; forward as-is",
)
# parse_known_args() returns unrecognised arguments separately instead of
# exiting; they are discarded here.
args, _ = parser.parse_known_args()

# Module-level settings derived from the parsed arguments.
# The trailing slash is stripped because the handler joins paths with "/".
UPSTREAM = args.upstream.rstrip("/")
PASSTHROUGH = args.passthrough
LOG_PATH = Path(args.log)
# Make sure the log file's directory exists before the first write.
LOG_PATH.parent.mkdir(exist_ok=True, parents=True)

app = FastAPI()
| # --------------------------------------------------------------------------- | |
| # Logging helper | |
| # --------------------------------------------------------------------------- | |
def log(entry: dict) -> None:
    """Record *entry* as a single JSON line.

    The line is appended to the file at ``LOG_PATH`` (UTF-8, non-ASCII
    characters kept as-is) and then echoed to stdout with an immediate flush.
    """
    serialized = json.dumps(entry, ensure_ascii=False)
    with LOG_PATH.open("a", encoding="utf-8") as log_file:
        log_file.write(f"{serialized}\n")
    print(serialized, flush=True)
| # --------------------------------------------------------------------------- | |
| # Proxy handler | |
| # --------------------------------------------------------------------------- | |
# Request headers that must not be copied to the upstream hop: httpx sets its
# own Host and recomputes the framing headers for the (possibly rewritten) body.
_STRIPPED_REQUEST_HEADERS = frozenset({"host", "content-length", "transfer-encoding"})

# Response headers that stop being accurate once httpx has decoded the body:
# forwarding them alongside the decoded bytes would make the client try to
# decompress plain data or wait for the wrong number of bytes.
_STRIPPED_RESPONSE_HEADERS = frozenset(
    {"connection", "content-encoding", "content-length", "transfer-encoding"}
)


def _timestamp() -> str:
    """Return the current local time as an ISO-8601 string with millisecond precision."""
    return datetime.now().isoformat(timespec="milliseconds")


def _lift_nested_body(payload: dict) -> None:
    """Merge ``payload["body"]`` into the top level of *payload*, in place.

    ``chat_template_kwargs`` found in the nested body is merged key-by-key
    into any top-level ``chat_template_kwargs`` (nested values win). Every
    other nested key overwrites the top-level key of the same name.
    The caller must have checked that ``payload["body"]`` is a dict.
    """
    nested = payload.pop("body")
    incoming = nested.get("chat_template_kwargs")
    if isinstance(incoming, dict):
        del nested["chat_template_kwargs"]
        current = payload.get("chat_template_kwargs")
        # Build a fresh dict; a non-dict top-level value (e.g. null) is replaced
        # rather than crashing on ``.update``.
        merged = dict(current) if isinstance(current, dict) else {}
        merged.update(incoming)
        payload["chat_template_kwargs"] = merged
    # Remaining keys (including a non-dict chat_template_kwargs) are forwarded
    # as-is at the top level so the upstream server can validate them itself.
    payload.update(nested)


def _upstream_unreachable(exc: Exception) -> Response:
    """Log a transport failure and build a 502 response describing it."""
    message = f"{type(exc).__name__}: {exc}"
    log({
        "ts": _timestamp(),
        "direction": "upstream_error",
        "error": message,
    })
    return Response(
        content=json.dumps({"error": message}, ensure_ascii=False),
        status_code=502,
        media_type="application/json",
    )


@app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "PATCH", "OPTIONS", "HEAD"])
async def proxy(request: Request, path: str):
    """Forward any request to ``UPSTREAM``, logging both directions.

    Unless ``PASSTHROUGH`` is set, a JSON request body that contains a nested
    ``"body"`` object is rewritten so that the nested keys (notably
    ``chat_template_kwargs``) appear at the top level before forwarding.

    Requests whose JSON body has a truthy ``"stream"`` value are relayed
    chunk by chunk; all other requests are buffered and returned in one piece.
    The upstream status code is propagated in both cases. If the upstream
    cannot be reached, a 502 response is returned.
    """
    url = f"{UPSTREAM}/{path}"
    if request.url.query:
        url += f"?{request.url.query}"

    body_bytes = await request.body()

    # Parse the request body for logging; fall back to text when it is not JSON.
    body_json = None
    if body_bytes:
        try:
            body_json = json.loads(body_bytes)
        except ValueError:  # covers JSONDecodeError and UnicodeDecodeError
            body_json = body_bytes.decode("utf-8", errors="replace")

    ts = _timestamp()
    log({
        "ts": ts,
        "direction": "request_original",
        "method": request.method,
        "path": f"/{path}",
        "headers": dict(request.headers),
        "body": body_json,
    })

    # Rewrite: lift body.chat_template_kwargs → top-level chat_template_kwargs.
    # OpenCode sends options.body as a nested "body" key; llama-server needs it at top level.
    if not PASSTHROUGH and isinstance(body_json, dict) and isinstance(body_json.get("body"), dict):
        _lift_nested_body(body_json)
        body_bytes = json.dumps(body_json, ensure_ascii=False).encode("utf-8")
        # Logged whenever the forwarded bytes differ from the original request,
        # so the log always shows what the upstream actually received.
        log({
            "ts": ts,
            "direction": "request_rewritten",
            "method": request.method,
            "path": f"/{path}",
            "body": body_json,
        })

    headers = {
        name: value
        for name, value in request.headers.items()
        if name.lower() not in _STRIPPED_REQUEST_HEADERS
    }

    is_stream = isinstance(body_json, dict) and bool(body_json.get("stream", False))

    if is_stream:
        # The client must outlive this coroutine because the generator below
        # runs after it returns, so it is closed explicitly instead of via
        # ``async with``.
        client = httpx.AsyncClient(timeout=300.0)
        try:
            # Open the upstream response *before* replying so that its status
            # code and content type can be propagated to the caller.
            upstream_req = client.build_request(
                request.method,
                url,
                headers=headers,
                content=body_bytes,
            )
            upstream_resp = await client.send(upstream_req, stream=True)
        except httpx.RequestError as exc:
            await client.aclose()
            return _upstream_unreachable(exc)
        except Exception:
            await client.aclose()
            raise

        log({
            "ts": _timestamp(),
            "direction": "response_start",
            "status": upstream_resp.status_code,
            "headers": dict(upstream_resp.headers),
        })

        async def stream_generator():
            chunks = []
            try:
                async for chunk in upstream_resp.aiter_bytes():
                    chunks.append(chunk)
                    yield chunk
            finally:
                # Runs on normal completion, upstream failure, and client
                # disconnect alike; whatever was relayed so far is logged.
                await upstream_resp.aclose()
                await client.aclose()
                combined = b"".join(chunks).decode("utf-8", errors="replace")
                log({
                    "ts": _timestamp(),
                    "direction": "response_body_stream",
                    "body_raw": combined,
                })

        return StreamingResponse(
            stream_generator(),
            status_code=upstream_resp.status_code,
            media_type=upstream_resp.headers.get("content-type", "text/event-stream"),
        )

    try:
        async with httpx.AsyncClient(timeout=300.0) as client:
            upstream_resp = await client.request(
                request.method,
                url,
                headers=headers,
                content=body_bytes,
            )
    except httpx.RequestError as exc:
        return _upstream_unreachable(exc)

    try:
        resp_json = upstream_resp.json()
    except ValueError:
        resp_json = upstream_resp.text

    log({
        "ts": _timestamp(),
        "direction": "response",
        "status": upstream_resp.status_code,
        "headers": dict(upstream_resp.headers),
        "body": resp_json,
    })

    # ``upstream_resp.content`` is already decoded by httpx, so encoding and
    # framing headers are dropped and recomputed by the response object.
    response_headers = {
        name: value
        for name, value in upstream_resp.headers.items()
        if name.lower() not in _STRIPPED_RESPONSE_HEADERS
    }
    return Response(
        content=upstream_resp.content,
        status_code=upstream_resp.status_code,
        headers=response_headers,
    )
if __name__ == "__main__":
    # Startup banner: where the proxy listens, where it forwards, where it
    # logs, and whether request rewriting is active.
    mode = "rewrite (chat_template_kwargs lifting)"
    if PASSTHROUGH:
        mode = "passthrough (no rewriting)"
    banner = (
        f"Proxy listening on http://127.0.0.1:{args.listen_port}",
        f"Forwarding to: {UPSTREAM}",
        f"Log file: {LOG_PATH.resolve()}",
        f"Mode: {mode}",
    )
    for banner_line in banner:
        print(banner_line, flush=True)
    # Bind to loopback only; uvicorn's own logging is limited to warnings.
    uvicorn.run(app, host="127.0.0.1", port=args.listen_port, log_level="warning")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment