How to proxy Cursor → OpenAI API → Python → Ollama

Make sure ollama serve is running; Ollama listens on http://localhost:11434 and exposes an OpenAI-compatible API under /v1.
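If you want to confirm Ollama is reachable before wiring anything else up, here is a quick sanity check, a sketch only, assuming the default port (httpx is already pinned in requirements.txt and /api/tags is Ollama's native endpoint for listing pulled models):

# check_ollama.py - hypothetical helper, not part of the gist
import httpx

resp = httpx.get("http://localhost:11434/api/tags", timeout=5)
resp.raise_for_status()
for model in resp.json().get("models", []):
    print(model["name"])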

Open Cursor's model settings and:

  • Specify http://localhost:8000/v1 as the OpenAI base URL
  • Hit "Save" (no need to enter or verify an API key)
  • Select gpt-4o and enjoy your local model

Then set up and run the proxy:
uv venv
source .venv/bin/activate
uv pip install -r requirements.txt
python ./run.py
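
Once run.py is up, you can sanity-check the proxy without Cursor. A minimal sketch using httpx; the gpt-4o name is remapped to your local Ollama model by MODEL_MAPPING in run.py:

# test_proxy.py - hypothetical smoke test, not part of the gist
import httpx

payload = {
    "model": "gpt-4o",  # remapped to OLLAMA_MODEL by run.py
    "stream": False,
    "messages": [{"role": "user", "content": "Say hello in one sentence."}],
}
resp = httpx.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=120)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])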

This is a convoluted, vibe-coded script. It works, but feel free to improve it, simplify it, or add your own twist.

Enjoy!

requirements.txt

aiohappyeyeballs==2.6.1
aiohttp==3.11.18
aiosignal==1.3.2
annotated-types==0.7.0
anyio==4.9.0
attrs==25.3.0
certifi==2025.4.26
charset-normalizer==3.4.2
click==8.1.8
dataclasses-json==0.6.7
fastapi==0.115.12
frozenlist==1.6.0
h11==0.16.0
httpcore==1.0.9
httpx==0.28.1
httpx-sse==0.4.0
idna==3.10
jsonpatch==1.33
jsonpointer==3.0.0
langchain==0.3.25
langchain-community==0.3.23
langchain-core==0.3.58
langchain-ollama==0.3.2
langchain-text-splitters==0.3.8
langsmith==0.3.42
marshmallow==3.26.1
multidict==6.4.3
mypy-extensions==1.1.0
numpy==2.2.5
ollama==0.4.8
orjson==3.10.18
packaging==24.2
propcache==0.3.1
pydantic==2.11.4
pydantic-core==2.33.2
pydantic-settings==2.9.1
python-dotenv==1.1.0
pyyaml==6.0.2
requests==2.32.3
requests-toolbelt==1.0.0
sniffio==1.3.1
sqlalchemy==2.0.40
starlette==0.46.2
tenacity==9.1.2
typing-extensions==4.13.2
typing-inspect==0.9.0
typing-inspection==0.4.0
urllib3==2.4.0
uvicorn==0.34.2
yarl==1.20.0
zstandard==0.23.0
run.py

import os
import logging
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import StreamingResponse
from fastapi.middleware.cors import CORSMiddleware
import httpx
import json
from typing import Dict, Any
from dotenv import load_dotenv
import time
from langchain_ollama import ChatOllama
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
from agents.base import DefaultAgent
from agents.driver import DriverAgent
from agents.navigator import NavigatorAgent
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
load_dotenv()
app = FastAPI()
# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allows all origins
    allow_credentials=True,
    allow_methods=["*"],  # Allows all methods
    allow_headers=["*"],  # Allows all headers
)
# Native Ollama API base (no /v1 suffix): ChatOllama and the /api/tags call below
# both use Ollama's native API rather than its OpenAI-compatible /v1 layer.
OLLAMA_BASE = os.getenv('OLLAMA_API_BASE', 'http://localhost:11434')
DEFAULT_MODEL = os.getenv('OLLAMA_MODEL', 'deepseek-r1:32b')
async def stream_response(response: httpx.Response):
    # Pass-through byte streamer (currently unused by the endpoints below).
    async for chunk in response.aiter_bytes():
        yield chunk
# Agents used when a request includes function calls (see the agents/ package)
AVAILABLE_AGENTS = {
    "driver": DriverAgent(),
    "navigator": NavigatorAgent(),
    "default": DefaultAgent(),
}

# Cursor asks for OpenAI model names; map them onto the local Ollama model
MODEL_MAPPING = {
    "gpt-4o": DEFAULT_MODEL,
    "gpt-4.1": DEFAULT_MODEL,
}
@app.post("/v1/chat/completions")
async def handle_chat_completion(request: Request) -> Dict[str, Any]:
    try:
        request_data = await request.json()
        logger.info("Received chat completion request:")
        logger.info(json.dumps(request_data, indent=2))

        model = request_data.get('model', DEFAULT_MODEL)
        model = MODEL_MAPPING.get(model, model)
        agent_name = request_data.get('agent', 'default')
        selected_agent = AVAILABLE_AGENTS.get(agent_name, AVAILABLE_AGENTS["default"])
        messages = request_data.get('messages', [])
        stream = request_data.get('stream', False)

        # Convert messages to langchain format
        langchain_messages = []
        for msg in messages:
            content = msg.get('content', '')
            if msg['role'] == 'system':
                langchain_messages.append(SystemMessage(content=content))
            elif msg['role'] == 'user':
                langchain_messages.append(HumanMessage(content=content))
            elif msg['role'] == 'assistant':
                langchain_messages.append(AIMessage(content=content))

        # Check if the request contains function calls
        functions = request_data.get('functions', None)
        function_call = request_data.get('function_call', None)

        # Use an agent if there are function calls, otherwise use normal chat
        if functions or function_call:
            response_content = await selected_agent.process_with_functions(
                request_data['messages'][-1]['content'],
                functions,
                function_call
            )
            formatted_response = {
                "id": f"chatcmpl-{os.urandom(12).hex()}",
                "object": "chat.completion",
                "created": int(time.time()),
                "model": model,
                "system_fingerprint": "fp_" + os.urandom(5).hex(),
                "choices": [{
                    "index": 0,
                    "message": {
                        "role": "assistant",
                        "content": response_content,
                    },
                    "finish_reason": "stop"
                }],
                "usage": {
                    "prompt_tokens": -1,
                    "completion_tokens": -1,
                    "total_tokens": -1
                }
            }
            return formatted_response

        chat_model = ChatOllama(
            model=model,
            base_url=OLLAMA_BASE,
            streaming=stream
        )

        if stream:
            async def generate_stream():
                current_content = ""
                stream_gen = chat_model.astream(langchain_messages)

                async def transform_chunk(chunk):
                    # Wrap each Ollama chunk in an OpenAI-style SSE "chat.completion.chunk"
                    nonlocal current_content
                    if chunk.content:
                        current_content += chunk.content
                        response_json = {
                            "id": f"chatcmpl-{os.urandom(12).hex()}",
                            "object": "chat.completion.chunk",
                            "created": int(time.time()),
                            "model": model,
                            "system_fingerprint": "fp_" + os.urandom(5).hex(),
                            "choices": [{
                                "index": 0,
                                "delta": {
                                    "content": chunk.content
                                },
                                "finish_reason": None
                            }]
                        }
                        return f"data: {json.dumps(response_json)}\n\n"
                    return ""

                async for chunk in stream_gen:
                    yield await transform_chunk(chunk)

                # Send final chunk
                final_json = {
                    "id": f"chatcmpl-{os.urandom(12).hex()}",
                    "object": "chat.completion.chunk",
                    "created": int(time.time()),
                    "model": model,
                    "system_fingerprint": "fp_" + os.urandom(5).hex(),
                    "choices": [{
                        "index": 0,
                        "delta": {},
                        "finish_reason": "stop"
                    }]
                }
                yield f"data: {json.dumps(final_json)}\n\n"
                yield "data: [DONE]\n\n"

            return StreamingResponse(
                generate_stream(),
                media_type="text/event-stream"
            )
        else:
            response = await chat_model.ainvoke(langchain_messages)
            formatted_response = {
                "id": f"chatcmpl-{os.urandom(12).hex()}",
                "object": "chat.completion",
                "created": int(time.time()),
                "model": model,
                "system_fingerprint": "fp_" + os.urandom(5).hex(),
                "choices": [{
                    "index": 0,
                    "message": {
                        "role": "assistant",
                        "content": response.content,
                    },
                    "finish_reason": "stop"
                }],
                "usage": {
                    "prompt_tokens": -1,
                    "completion_tokens": -1,
                    "total_tokens": -1
                }
            }
            logger.info("Formatted response:")
            logger.info(json.dumps(formatted_response, indent=2))
            return formatted_response
    except Exception as e:
        logger.error(f"Error processing request: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
@app.get("/v1/models")
async def list_models():
    try:
        async with httpx.AsyncClient() as client:
            # Ollama's native endpoint for locally pulled models
            response = await client.get(f"{OLLAMA_BASE}/api/tags")
            models = response.json()
            return {
                "object": "list",
                "data": [
                    {
                        "id": model["name"],
                        "object": "model",
                        "created": int(time.time()),
                        "owned_by": "organization-owner",
                    }
                    for model in models.get("models", [])
                ],
            }
    except Exception as e:
        logger.error(f"Error listing models: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
if __name__ == "__main__":
    host = os.getenv('HOST', '0.0.0.0')
    port = int(os.getenv('PORT', '8000'))
    logger.info(f"Starting API compatibility layer on {host}:{port}")
    logger.info(f"Proxying requests to Ollama at {OLLAMA_BASE}")

    import uvicorn
    uvicorn.run(app, host=host, port=port, log_level="info")
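
The agents package (agents/base.py, agents/driver.py, agents/navigator.py) is not shown above. The proxy only ever awaits process_with_functions(message, functions, function_call) on an agent and uses the returned string as the assistant's content, so a minimal stand-in, assuming nothing beyond that call signature, could look like this until real agent logic is plugged in:

# agents/base.py - hypothetical stand-in; the real agents are not included in this gist
from langchain_ollama import ChatOllama
from langchain_core.messages import HumanMessage


class DefaultAgent:
    """Bare-bones agent: ignores the function schema and just answers the prompt."""

    def __init__(self, model: str = "deepseek-r1:32b"):  # assumed to match OLLAMA_MODEL
        self.chat = ChatOllama(model=model)

    async def process_with_functions(self, message, functions=None, function_call=None):
        response = await self.chat.ainvoke([HumanMessage(content=message)])
        return response.content


# agents/driver.py and agents/navigator.py could subclass this with their own prompts
class DriverAgent(DefaultAgent):
    pass


class NavigatorAgent(DefaultAgent):
    pass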