A lightweight HTTP proxy that lets OpenAI-compatible clients talk to Ollama while disabling "thinking" mode for faster responses and ensuring a context window large enough for tool-heavy requests.
If you're using Clawdbot with Ollama, this proxy solves three critical issues:
| Issue | Symptom | Proxy Fix |
|---|---|---|
| Thinking mode | 100% CPU for 2-5 min, slow responses | `think: false` injection |
| Context overflow | Crashes with 23+ tools | `num_ctx: 32768` |
| Infinite hangs | Requests never return | 120s timeout + auto-restart |
Add to your `clawdbot.json`:

```json
{
"models": {
"providers": {
"ollama": {
"baseUrl": "http://127.0.0.1:11435/v1",
"apiKey": "ollama-local",
"api": "openai-completions",
"timeout": 120000
}
}
},
"auth": {
"profiles": {
"ollama:default": {
"provider": "ollama",
"mode": "token"
}
}
}
}
```

Then restart: `clawdbot gateway restart`
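To sanity-check the proxy independently of Clawdbot, you can call it with Node's built-in `fetch` (Node 18+). This is only a sketch: the model name is a placeholder, so substitute one you have actually pulled, and the proxy script (shown below) must already be running on port 11435.

```js
// smoke-test.mjs (run with: node smoke-test.mjs)
const res = await fetch('http://localhost:11435/v1/chat/completions', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    model: 'glm-4.7-flash:q8_0', // placeholder: use a model from `ollama list`
    messages: [{ role: 'user', content: 'Reply with the single word: pong' }]
  })
});

const data = await res.json();
console.log(data.choices[0].message.content);
```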
The Ollama No-Think Proxy sits between your OpenAI-compatible client and Ollama:
```
┌───────────────────┐      ┌────────────────────┐      ┌─────────────────┐
│   Clawdbot /      │─────▶│   No-Think Proxy   │─────▶│   Ollama        │
│   OpenAI Client   │      │   (Port 11435)     │      │   (Port 11434)  │
└───────────────────┘      └────────────────────┘      └─────────────────┘
                                      │
                                      ▼
                            • Injects think: false
                            • Sets num_ctx: 32768
                            • 120s timeout
                            • Health monitoring
```
When using GLM-4.7 or similar reasoning-capable models through Ollama, the default "thinking" mode causes:
- 2-5 minute response times for simple queries
- 100% CPU utilization during "thinking"
- Context overflow with 23+ tools (the default 4096-token context is insufficient)

This proxy transparently injects `think: false` and `num_ctx: 32768` into all requests.
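Condensed from the full script below, the core of that conversion looks roughly like this; the helper name `toOllamaRequest` is only for illustration and is not part of the proxy itself:

```js
// Sketch of the OpenAI -> Ollama request conversion the proxy performs
// (tool passthrough and simulated streaming are handled in the full script below).
function toOllamaRequest(openaiReq) {
  return {
    model: openaiReq.model,
    // Flatten OpenAI content arrays (e.g. [{ type: 'text', text: '...' }]) into plain strings
    messages: openaiReq.messages.map(m => ({
      role: m.role,
      content: Array.isArray(m.content)
        ? m.content.filter(p => p.type === 'text').map(p => p.text).join('\n')
        : m.content
    })),
    stream: false,               // Ollama replies in one shot; the proxy simulates SSE itself
    think: false,                // disable reasoning/"thinking" output
    options: { num_ctx: 32768 }  // enough context for tool-heavy requests
  };
}
```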
Prerequisites:

- Node.js 18+
- Ollama installed and running on port 11434
- systemd (recommended for auto-restart)
Create `/home/USER/ollama-nothink-proxy.js`:

```js
#!/usr/bin/env node
/**
* Ollama No-Think Proxy v7 (with timeout)
*
* Features:
* - Converts OpenAI format → Ollama native format
* - Sets think: false to disable reasoning output
* - Sets num_ctx: 32768 for sufficient context
* - Passes tools through and converts tool_calls back to OpenAI format
* - Simulated streaming (Ollama non-stream → OpenAI SSE)
* - Request timeout: 120s (prevents infinite hangs)
*/
const http = require('http');
const PROXY_PORT = 11435;
const OLLAMA_HOST = '127.0.0.1';
const OLLAMA_PORT = 11434;
const REQUEST_TIMEOUT_MS = 120000; // 120 seconds
const HEALTH_CHECK_INTERVAL_MS = 30000; // 30 seconds
const STARTUP_RETRY_MS = 5000; // 5 seconds
const log = (msg) => console.log(`[${new Date().toISOString()}] ${msg}`);
// Health check: verify Ollama is responding
async function checkOllamaHealth() {
return new Promise((resolve) => {
const req = http.request({
hostname: OLLAMA_HOST,
port: OLLAMA_PORT,
path: '/api/tags',
method: 'GET',
timeout: 5000
}, (res) => {
let data = '';
res.on('data', chunk => data += chunk);
res.on('end', () => resolve(res.statusCode === 200));
});
req.on('error', () => resolve(false));
req.on('timeout', () => { req.destroy(); resolve(false); });
req.end();
});
}
// Wait for Ollama to be ready at startup
async function waitForOllama() {
log('Checking Ollama availability...');
for (let i = 0; i < 12; i++) { // Try for 1 minute
if (await checkOllamaHealth()) {
log('Ollama is ready');
return true;
}
log(`Ollama not ready, retry in ${STARTUP_RETRY_MS/1000}s...`);
await new Promise(r => setTimeout(r, STARTUP_RETRY_MS));
}
log('Ollama not available after 60s, exiting');
process.exit(1);
}
// Periodic health check
let lastHealthy = Date.now();
setInterval(async () => {
if (await checkOllamaHealth()) {
lastHealthy = Date.now();
} else {
const downtime = (Date.now() - lastHealthy) / 1000;
log(`Ollama health check failed (down ${downtime.toFixed(0)}s)`);
if (downtime > 120) {
log('Ollama down >2min, exiting for restart');
process.exit(1);
}
}
}, HEALTH_CHECK_INTERVAL_MS);
const server = http.createServer((req, res) => {
let body = '';
req.on('data', chunk => body += chunk);
req.on('end', () => {
log(`${req.method} ${req.url} (body: ${body.length} bytes)`);
// For chat completions, use native Ollama API with think: false
if (req.url === '/v1/chat/completions' && req.method === 'POST') {
try {
const openaiReq = JSON.parse(body);
const wantsStreaming = openaiReq.stream === true;
const hasTools = openaiReq.tools && openaiReq.tools.length > 0;
log(`Model: ${openaiReq.model}, Messages: ${openaiReq.messages?.length || 0}, Stream: ${wantsStreaming}, Tools: ${hasTools ? openaiReq.tools.length : 0}`);
// Convert messages: flatten content arrays to strings
const convertedMessages = openaiReq.messages.map(msg => {
let content = msg.content;
if (Array.isArray(content)) {
content = content
.filter(part => part.type === 'text' || typeof part === 'string')
.map(part => typeof part === 'string' ? part : part.text)
.join('\n');
}
return { role: msg.role, content };
});
// Convert to Ollama native format
const ollamaReq = {
model: openaiReq.model,
messages: convertedMessages,
stream: false,
think: false,
options: {
num_ctx: 32768
}
};
// Pass through tools if present
if (hasTools) {
ollamaReq.tools = openaiReq.tools;
}
if (openaiReq.max_tokens) ollamaReq.options.num_predict = openaiReq.max_tokens;
if (openaiReq.temperature !== undefined) ollamaReq.options.temperature = openaiReq.temperature;
const ollamaBody = JSON.stringify(ollamaReq);
log(`Forwarding to Ollama /api/chat (${ollamaBody.length} bytes, tools: ${hasTools})`);
const proxyReq = http.request({
hostname: OLLAMA_HOST,
port: OLLAMA_PORT,
path: '/api/chat',
method: 'POST',
timeout: REQUEST_TIMEOUT_MS,
headers: {
'Content-Type': 'application/json',
'Content-Length': Buffer.byteLength(ollamaBody)
}
}, proxyRes => {
let responseBody = '';
proxyRes.on('data', chunk => responseBody += chunk);
proxyRes.on('end', () => {
log(`Ollama response: ${proxyRes.statusCode} (${responseBody.length} bytes)`);
if (proxyRes.statusCode >= 400) {
log(`Ollama error: ${responseBody}`);
res.writeHead(proxyRes.statusCode, { 'Content-Type': 'application/json' });
res.end(responseBody);
return;
}
try {
const ollamaRes = JSON.parse(responseBody);
const content = ollamaRes.message?.content || '';
const ollamaToolCalls = ollamaRes.message?.tool_calls || null;
const chatId = `chatcmpl-${Date.now()}`;
// Convert Ollama tool_calls to OpenAI format
let openaiToolCalls = null;
if (ollamaToolCalls && ollamaToolCalls.length > 0) {
openaiToolCalls = ollamaToolCalls.map((tc, idx) => ({
id: tc.id || `call_${Date.now()}_${idx}`,
type: 'function',
function: {
name: tc.function.name,
arguments: typeof tc.function.arguments === 'string'
? tc.function.arguments
: JSON.stringify(tc.function.arguments)
}
}));
log(`Tool calls: ${openaiToolCalls.map(tc => tc.function.name).join(', ')}`);
}
if (wantsStreaming) {
res.writeHead(200, {
'Content-Type': 'text/event-stream',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive'
});
const delta = { role: 'assistant' };
if (content) delta.content = content;
if (openaiToolCalls) delta.tool_calls = openaiToolCalls;
const chunk = {
id: chatId,
object: 'chat.completion.chunk',
created: Math.floor(Date.now() / 1000),
model: ollamaRes.model,
choices: [{ index: 0, delta: delta, finish_reason: null }]
};
res.write(`data: ${JSON.stringify(chunk)}\n\n`);
const finishReason = openaiToolCalls ? 'tool_calls' : 'stop';
const finishChunk = {
id: chatId,
object: 'chat.completion.chunk',
created: Math.floor(Date.now() / 1000),
model: ollamaRes.model,
choices: [{ index: 0, delta: {}, finish_reason: finishReason }]
};
res.write(`data: ${JSON.stringify(finishChunk)}\n\n`);
res.write('data: [DONE]\n\n');
res.end();
log(`Sent streaming response (finish: ${finishReason})`);
} else {
const message = { role: 'assistant', content: content };
if (openaiToolCalls) message.tool_calls = openaiToolCalls;
const finishReason = openaiToolCalls ? 'tool_calls' : (ollamaRes.done_reason || 'stop');
const openaiRes = {
id: chatId,
object: 'chat.completion',
created: Math.floor(Date.now() / 1000),
model: ollamaRes.model,
choices: [{
index: 0,
message: message,
finish_reason: finishReason
}],
usage: {
prompt_tokens: ollamaRes.prompt_eval_count || 0,
completion_tokens: ollamaRes.eval_count || 0,
total_tokens: (ollamaRes.prompt_eval_count || 0) + (ollamaRes.eval_count || 0)
}
};
res.writeHead(200, { 'Content-Type': 'application/json' });
res.end(JSON.stringify(openaiRes));
}
} catch (e) {
log(`Parse error: ${e.message}`);
res.writeHead(500, { 'Content-Type': 'application/json' });
res.end(JSON.stringify({ error: e.message }));
}
});
});
proxyReq.on('error', e => {
log(`Ollama error: ${e.message}`);
res.writeHead(502, { 'Content-Type': 'application/json' });
res.end(JSON.stringify({ error: `Ollama error: ${e.message}` }));
});
proxyReq.on('timeout', () => {
log(`Ollama timeout after ${REQUEST_TIMEOUT_MS/1000}s`);
proxyReq.destroy();
res.writeHead(504, { 'Content-Type': 'application/json' });
res.end(JSON.stringify({ error: `Ollama timeout after ${REQUEST_TIMEOUT_MS/1000}s` }));
});
proxyReq.write(ollamaBody);
proxyReq.end();
return;
} catch (e) {
log(`Request parse error: ${e.message}`);
res.writeHead(400, { 'Content-Type': 'application/json' });
res.end(JSON.stringify({ error: e.message }));
return;
}
}
// Handle native Ollama API endpoints
if ((req.url === '/api/chat' || req.url === '/api/generate') && req.method === 'POST' && body) {
try {
const parsed = JSON.parse(body);
let modified = false;
if (parsed.think !== false) {
parsed.think = false;
modified = true;
}
if (!parsed.options) parsed.options = {};
if (!parsed.options.num_ctx || parsed.options.num_ctx < 32768) {
parsed.options.num_ctx = 32768;
modified = true;
}
if (modified) {
log(`Injected think:false + num_ctx:32768 into native ${req.url}`);
}
body = JSON.stringify(parsed);
} catch (e) {
log(`Failed to parse native request: ${e.message}`);
}
}
// Pass through all other requests
log(`Passthrough to Ollama ${req.url}`);
const proxyReq = http.request({
hostname: OLLAMA_HOST,
port: OLLAMA_PORT,
path: req.url,
method: req.method,
timeout: REQUEST_TIMEOUT_MS,
headers: {
...req.headers,
'Content-Length': Buffer.byteLength(body || '')
}
}, proxyRes => {
res.writeHead(proxyRes.statusCode, proxyRes.headers);
proxyRes.pipe(res);
});
proxyReq.on('error', e => {
res.writeHead(502, { 'Content-Type': 'application/json' });
res.end(JSON.stringify({ error: e.message }));
});
proxyReq.on('timeout', () => {
log(`Passthrough timeout after ${REQUEST_TIMEOUT_MS/1000}s`);
proxyReq.destroy();
res.writeHead(504, { 'Content-Type': 'application/json' });
res.end(JSON.stringify({ error: `Timeout after ${REQUEST_TIMEOUT_MS/1000}s` }));
});
if (body) proxyReq.write(body);
proxyReq.end();
});
});
// Startup
(async () => {
await waitForOllama();
server.listen(PROXY_PORT, '127.0.0.1', () => {
log(`Ollama No-Think Proxy v7 (timeout: ${REQUEST_TIMEOUT_MS/1000}s) on http://127.0.0.1:${PROXY_PORT}`);
log(`Health check every ${HEALTH_CHECK_INTERVAL_MS/1000}s, auto-exit if Ollama down >2min`);
});
})();
```

Create `~/.config/systemd/user/ollama-nothink-proxy.service`:

```ini
[Unit]
Description=Ollama No-Think Proxy
After=network.target
Wants=ollama.service
[Service]
Type=simple
ExecStart=/usr/bin/node /home/USER/ollama-nothink-proxy.js
Restart=always
RestartSec=5
[Install]
WantedBy=default.target
```

Replace `/home/USER/` with your actual home directory path.

```bash
systemctl --user daemon-reload
systemctl --user enable ollama-nothink-proxy
systemctl --user start ollama-nothink-proxy

# Check service status
systemctl --user status ollama-nothink-proxy
# View logs
journalctl --user -u ollama-nothink-proxy -f
# Test proxy is working
curl http://localhost:11435/api/tags
```

Add to your `clawdbot.json`:

```json
{
"models": {
"providers": {
"ollama": {
"baseUrl": "http://127.0.0.1:11435/v1",
"apiKey": "ollama-local",
"api": "openai-completions",
"timeout": 120000,
"models": [
{
"id": "glm-4.7-flash:q8_0",
"name": "GLM 4.7 Flash",
"contextWindow": 128000,
"maxTokens": 8192
}
]
}
}
},
"auth": {
"profiles": {
"ollama:default": {
"provider": "ollama",
"mode": "token"
}
}
},
"agents": {
"defaults": {
"models": {
"ollama/glm-4.7-flash:q8_0": {}
}
}
}
}
```

Then restart Clawdbot:

```bash
clawdbot gateway restart
```

Set your client's OpenAI base URL to:
```
http://localhost:11435/v1
```

Use any string for the API key (e.g., `ollama-local`).
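For example, with the official `openai` npm package (one possible client among many; install it with `npm install openai`), pointing at the proxy looks like the sketch below. Run it as an ES module (for example, save it as `test.mjs`) and swap in a model you have pulled:

```js
import OpenAI from 'openai';

const client = new OpenAI({
  baseURL: 'http://localhost:11435/v1', // the proxy, not Ollama's own port
  apiKey: 'ollama-local'                // any non-empty string is accepted
});

const reply = await client.chat.completions.create({
  model: 'glm-4.7-flash:q8_0',          // placeholder: use a locally pulled model
  messages: [{ role: 'user', content: 'Say hello in one sentence.' }]
});

console.log(reply.choices[0].message.content);
```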
Request flow:

- Client sends an OpenAI-format request to `http://localhost:11435/v1/chat/completions`
- Proxy converts it to Ollama's native format and injects `think: false` + `num_ctx: 32768`
- Ollama processes the request without thinking mode
- Proxy converts the response back to OpenAI format (simulated streaming, if requested; see the sketch below)
- Client receives a standard OpenAI-format response
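The streaming case mentioned in the flow above is simulated: the proxy waits for Ollama's complete reply, then emits one SSE content chunk, a finish chunk, and `data: [DONE]`. A small sketch of reading that output with Node's built-in `fetch` (the model name is again a placeholder):

```js
// stream-test.mjs: inspect the proxy's simulated SSE output
const res = await fetch('http://localhost:11435/v1/chat/completions', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    model: 'glm-4.7-flash:q8_0', // placeholder: use a locally pulled model
    stream: true,
    messages: [{ role: 'user', content: 'Name three colors.' }]
  })
});

// The proxy ends the response after writing all frames, so reading it as text is fine here.
const frames = (await res.text()).split('\n\n').filter(Boolean);
for (const frame of frames) {
  const payload = frame.replace(/^data: /, '');
  if (payload === '[DONE]') break;
  const delta = JSON.parse(payload).choices[0].delta;
  if (delta.content) process.stdout.write(delta.content);
}
process.stdout.write('\n');
```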
Health monitoring:

- Startup: Waits up to 60s for Ollama
- Periodic: Health check every 30s
- Auto-recovery: Exits if Ollama down >2min (systemd restarts)
| Issue | Solution |
|---|---|
| "No auth profile for ollama" | Add ollama:default auth profile |
| 100% CPU / slow responses | Verify proxy port (11435) not 11434 |
| Context overflow crashes | Check num_ctx: 32768 in logs |
| Timeout errors | Increase REQUEST_TIMEOUT_MS in script |
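For the timeout row above, the constant lives near the top of the proxy script and raising it is a one-line change; if you raise it, consider raising the matching `timeout` value in `clawdbot.json` as well so the client does not give up first.

```js
// In ollama-nothink-proxy.js, near the top (value is in milliseconds)
const REQUEST_TIMEOUT_MS = 300000; // e.g. 5 minutes instead of the default 120s
```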
Follow the proxy logs with:

```bash
journalctl --user -u ollama-nothink-proxy -f
```

With this proxy (GLM-4.7 q8_0 on a Ryzen AI 9 HX 370):
- Response time: 5-15s (vs 2-5min with thinking)
- CPU: Normal (vs 100% sustained)
- Memory: ~35GB with 32k context
- Timeout: 120s (prevents infinite hangs)
MIT License – Free to use, modify, and distribute.