Ruminate: Taking Control of AI Reasoning Speed

How we built a proxy to make reasoning AI models faster and more predictable

The Problem: All-or-Nothing Reasoning

Modern AI models like Qwen3 and DeepSeek R1 have a cool feature called "reasoning" or "thinking" mode. When enabled, they work through problems step-by-step in a <think>...</think> block before giving you the final answer. This dramatically improves accuracy on complex tasks.
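
For example, a thinking-mode response to a shape-recognition question looks roughly like this (an illustrative transcript, not actual model output):

<think>
The figure has five sides and they all look equal in length, so this should be a
regular pentagon. Let me double-check the vertex count... yes, five.
</think>
The shape is a pentagon.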

But there's a catch: it's all-or-nothing. You either get no reasoning (fast but often wrong) or unlimited reasoning (accurate but unpredictably slow).

Here's what we saw testing a geometric shape recognition task:

  • No reasoning: 27% accuracy, ~3 tokens, super fast
  • Full reasoning: 75% accuracy, ~2000 tokens average, but some responses took 8000+ tokens!

That 300x variance in response time makes reasoning models impractical for real applications. You can't tell users "this might take 1 second or 30 seconds, we'll see!"

The Solution: Staged Reasoning with Ruminate

What if we could give the AI a "thinking budget" and gentle nudges to wrap up when time is running short? That's exactly what our proxy server "Ruminate" does.

Instead of unlimited thinking, Ruminate breaks reasoning into stages:

Stage 1: Initial Thinking (The "Ideal" Budget)

"Here's your ideal time to think through this problem thoroughly."

Stage 2: Soft Warning (The Adaptive Buffer)

"Time's getting short, make sure you're on the right track."

Stage 3: Hard Warning (Last Chance)

"Really need to wrap this up now, summarize your thoughts."

Stage 4: Emergency Termination

"Time's up, give your best answer based on what you've thought so far."

How It Works

Ruminate sits between your application and the AI model as a proxy server. When you send a chat request, it:

  1. Converts your chat messages into a text completion prompt
  2. Adds reasoning tags and makes multiple API calls to the real model
  3. Injects helpful prompts at each stage ("considering the limited time...")
  4. Monitors for natural completion (when the model closes </think> on its own)
  5. Returns the final response as a standard chat completion
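
Steps 1 through 3 come down to splicing text into a plain completions prompt. Here's a stripped-down sketch of that splicing (it mirrors the full proxy code at the end of this post; the backend call is shown as a comment):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B")
messages = [{"role": "user", "content": "What shape has five equal sides?"}]

# 1. Chat messages -> text completion prompt
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# 2. Open a thinking block and ask the backend for at most initial_think tokens
prompt += "\n<think>\n"
# completion, used_tokens = POST {backend}/v1/completions with {"prompt": prompt, "max_tokens": initial_think}

# 3. If the model hasn't closed </think> on its own, inject the soft-warning nudge and continue
prompt += "\nConsidering the limited time by the user, I'd better make sure I'm on the right track."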

The key insight: most good reasoning happens early. If we can catch the runaway cases while letting normal reasoning complete naturally, we get the best of both worlds.

Real Results: The Numbers Don't Lie

Testing on geometric shape recognition with 275 test cases each:

Configuration            Accuracy  95% CI   Avg Tokens  P95 Tokens
No reasoning             27.3%     ±5.2%    3           3
Full reasoning           75.3%     ±5.1%    2,036       4,532
Ruminate [200,200,200]   57.5%     ±5.8%    568         686
Ruminate [1000,600,400]  68.8%     ±5.7%    1,333       2,067

The Statistical Story

The confidence intervals tell a compelling story. The two Ruminate configurations have intervals of similar width (±5.8% and ±5.7%), but their point estimates differ by more than 11 points (57.5% vs 68.8%), so the intervals barely overlap. This suggests we're seeing a real performance difference, not just random variation.

More importantly, look at the predictability gains:

  • Full reasoning: P95 of 4,532 tokens (worst case roughly 2.2x the average)
  • Ruminate [1000,600,400]: P95 of 2,067 tokens (worst case roughly 1.6x the average)

We're getting 91% of full reasoning accuracy (68.8% vs 75.3%) while cutting worst-case response time in half.

The Key Insight: TERMINATE Rate as a KPI

Here's what we discovered: the frequency of hitting the final "emergency termination" stage is a crucial metric. When the AI is forced to stop mid-thought, quality suffers. When it completes reasoning naturally within the budget, quality stays high.

This gives us a principled way to tune the system:

  • Too many terminations? → Increase budgets
  • Response times too high? → Decrease budgets
  • Sweet spot: < 15% termination rate with controlled P95 times

The [1000,600,400] configuration significantly reduced termination events compared to [200,200,200], which explains the accuracy jump while maintaining reasonable response times.

A Framework for Task-Specific Optimization

We realized the three budgets serve different purposes:

  • initial_think: The "ideal" budget based on task complexity
  • hard_warn: Small fixed "emergency escape" budget
  • soft_warn: The tuning parameter that adapts to the model's reasoning patterns

For different tasks:

  • Simple math: [300, 150, 100]
  • Complex reasoning: [1000, 500, 200]
  • Multi-step proofs: [2000, 1000, 300]

You could even automate this: run a calibration phase for each task type, measure termination rates, and auto-tune the soft_warn parameter.
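
A rough sketch of what that calibration could look like, assuming the proxy below is running on port 8001 and using the termination_reason field it returns (the prompt list and the step size are placeholders):

import requests

CALIBRATION_PROMPTS = ["<task prompt 1>", "<task prompt 2>"]  # placeholder: representative prompts for this task type

def termination_rate(reason_control):
    """Fraction of calibration prompts that hit the emergency TERMINATE stage."""
    terminated = 0
    for prompt in CALIBRATION_PROMPTS:
        reply = requests.post("http://localhost:8001/v1/chat/completions", json={
            "model": "Qwen/Qwen3-4B",
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 1024,
            "reason_control": reason_control,
        }).json()
        if reply["termination_reason"] == "hard_terminated":
            terminated += 1
    return terminated / len(CALIBRATION_PROMPTS)

# Grow soft_warn until fewer than 15% of runs are hard-terminated
budgets = [1000, 200, 400]  # [initial_think, soft_warn, hard_warn]
while termination_rate(budgets) > 0.15:
    budgets[1] += 200  # soft_warn is the tuning knob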

The Speed-Accuracy Tradeoff Curve

What's exciting is that Ruminate gives you granular control over the speed-accuracy tradeoff:

27% accuracy → 57% accuracy → 69% accuracy → 75% accuracy
   3 tokens     568 tokens    1,333 tokens   2,036 tokens

Instead of a binary choice, you can dial in exactly where you want to be on this curve based on your application's needs.

Why This Matters

Ruminate transforms reasoning models from research curiosities into practical tools. Instead of choosing between "fast but dumb" or "smart but unpredictable," you can dial in exactly the speed/accuracy tradeoff your application needs.

This opens up reasoning models for:

  • Interactive applications (where response time matters)
  • Batch processing (where you can budget compute precisely)
  • Production systems (where predictability is crucial)

Try It Yourself

The full code is available at [GitHub link]. It's built with Python asyncio and FastAPI, and designed to work with any OpenAI-compatible API.

Key features:

  • Drop-in replacement for /v1/chat/completions
  • Configurable reasoning budgets via request parameters
  • Detailed metrics including termination reasons
  • Works with any model that supports thinking tags
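
For instance, a minimal client call against the proxy below (running on port 8001; the endpoint, parameter names, and termination_reason field all come from that code):

import requests

response = requests.post("http://localhost:8001/v1/chat/completions", json={
    "model": "Qwen/Qwen3-4B",
    "messages": [{"role": "user", "content": "How many sides does a heptagon have?"}],
    "max_tokens": 1024,
    "reason_control": [1000, 600, 400],  # [initial_think, soft_warn, hard_warn]
}).json()

print(response["choices"][0]["message"]["content"])
print("termination reason:", response["termination_reason"])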

What's Next?

We're exploring:

  • Adaptive budgets that learn from previous requests
  • Task-specific profiles that auto-configure for different problem types
  • Multi-model support beyond just Qwen
  • Streaming responses during the final answer phase

The future of AI isn't just about making models smarter—it's about making them smarter and more controllable. Ruminate is a step toward AI that thinks as much as you need, when you need it.

import asyncio
import aiohttp
import json
from typing import List, Dict, Any, Optional
from dataclasses import dataclass
from transformers import AutoTokenizer


@dataclass
class ReasoningConfig:
    initial_think: int
    soft_warn: int
    hard_warn: int
    reason_initial_text: str = ""
    reason_soft_text: str = "\nConsidering the limited time by the user, I'd better make sure I'm on the right track."
    reason_hard_text: str = "\nConsidering the limited time by the user, let me summarize my thoughts and finish up."
    reason_terminate_text: str = "\nConsidering the limited time by the user, I have to give the solution based on the thinking directly now."


class ReasoningProxy:
    def __init__(self, backend_url: str, model_name: str):
        self.backend_url = backend_url.rstrip('/')
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    async def _complete(self, session: aiohttp.ClientSession, prompt: str, max_tokens: int, **kwargs) -> tuple[str, int]:
        """Make a completion request and return (completion_text, tokens_used)"""
        payload = {
            "prompt": prompt,
            "max_tokens": max_tokens,
            **kwargs
        }
        # print("upstream completion:", payload)
        async with session.post(f"{self.backend_url}/v1/completions", json=payload) as response:
            # print(await response.text())
            response.raise_for_status()
            result = await response.json()
            completion_text = result["choices"][0]["text"]
            tokens_used = result["usage"]["completion_tokens"]
            return completion_text, tokens_used

    async def process_chat_completion(self, messages: List[Dict], reason_control: List[int],
                                      max_tokens: int, reason_initial_text: str = "",
                                      reason_soft_text: str = "\nConsidering the limited time by the user, I'd better make sure I'm on the right track.",
                                      reason_hard_text: str = "\nConsidering the limited time by the user, let me summarize my thoughts and finish up.",
                                      reason_terminate_text: str = "\nConsidering the limited time by the user, I have to give the solution based on the thinking directly now.",
                                      **kwargs) -> Dict[str, Any]:
        # Stage 0: Setup
        initial_prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        initial_think, soft_warn, hard_warn = reason_control
        current_prompt = initial_prompt
        termination_reason = "natural"
        skip_to_answer = False

        async with aiohttp.ClientSession() as session:
            # Stage 1: THINK
            current_prompt += "\n<think>\n"
            if initial_think > 0 and not skip_to_answer:
                current_prompt += reason_initial_text
                completion, used_tokens = await self._complete(session, current_prompt, initial_think, **kwargs)
                current_prompt += completion
                if used_tokens < initial_think:
                    # Natural completion
                    skip_to_answer = True
                elif "</think>" in completion:
                    # Found end of thinking
                    skip_to_answer = True
                # Otherwise continue to SOFT stage
            else:
                current_prompt += "\n</think>\n"
                skip_to_answer = True

            # Stage 2: SOFT
            if soft_warn > 0 and not skip_to_answer:
                current_prompt += reason_soft_text
                completion, used_tokens = await self._complete(session, current_prompt, soft_warn, **kwargs)
                current_prompt += completion
                if used_tokens < soft_warn:
                    # Natural completion
                    skip_to_answer = True
                elif "</think>" in completion:
                    # Found end of thinking
                    skip_to_answer = True
                # Otherwise continue to HARD stage

            # Stage 3: HARD
            if hard_warn > 0 and not skip_to_answer:
                print('HARD reached: ', current_prompt)
                current_prompt += reason_hard_text
                completion, used_tokens = await self._complete(session, current_prompt, hard_warn, **kwargs)
                current_prompt += completion
                if used_tokens < hard_warn:
                    # Natural completion
                    skip_to_answer = True
                elif "</think>" in completion:
                    # Found end of thinking
                    skip_to_answer = True
                # Otherwise continue to TERMINATE stage

            # Stage 4: TERMINATE
            if not skip_to_answer:
                print('TERMINATE reached: ', current_prompt)
                current_prompt += reason_terminate_text + "\n</think>\n\n"
                completion, used_tokens = await self._complete(session, current_prompt, max_tokens, **kwargs)
                current_prompt += completion
                termination_reason = "hard_terminated"
            else:
                # Stage 5: ANSWER
                completion, used_tokens = await self._complete(session, current_prompt, max_tokens, **kwargs)
                current_prompt += completion

        # Stage 6: DONE - return response
        final_response = current_prompt[len(initial_prompt):]
        return {
            "id": "chatcmpl-reasoning-proxy",
            "object": "chat.completion",
            "model": self.model_name,
            "choices": [{
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": final_response
                },
                "finish_reason": "stop"
            }],
            "usage": {
                "prompt_tokens": len(self.tokenizer.encode(initial_prompt)),
                "completion_tokens": len(self.tokenizer.encode(final_response)),
                "total_tokens": len(self.tokenizer.encode(current_prompt))
            },
            "termination_reason": termination_reason
        }


# FastAPI server
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

app = FastAPI()

# Global proxy instance - configure these for your setup
BACKEND_URL = "http://localhost:3333"  # Your actual model server
MODEL_NAME = "Qwen/Qwen3-4B"  # Update as needed
proxy = ReasoningProxy(BACKEND_URL, MODEL_NAME)


class ChatCompletionRequest(BaseModel):
    messages: List[Dict[str, str]]
    model: str
    max_tokens: int
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    min_p: Optional[float] = None
    top_k: Optional[float] = None
    reason_control: Optional[List[int]] = [0, 0, 0]  # Default values
    reason_initial_text: Optional[str] = ""
    reason_soft_text: Optional[str] = "\nConsidering the limited time by the user, I'd better make sure I'm on the right track."
    reason_hard_text: Optional[str] = "\nConsidering the limited time by the user, let me summarize my thoughts and finish up."
    reason_terminate_text: Optional[str] = "\nConsidering the limited time by the user, I have to give the solution based on the thinking directly now."


@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    try:
        # Extract reasoning parameters
        reason_params = {
            "reason_control": request.reason_control,
            "reason_initial_text": request.reason_initial_text,
            "reason_soft_text": request.reason_soft_text,
            "reason_hard_text": request.reason_hard_text,
            "reason_terminate_text": request.reason_terminate_text
        }
        # Extract other parameters (filter out reason_* and messages/max_tokens)
        other_params = {}
        for field_name, field_value in request.model_dump().items():
            if not field_name.startswith("reason_") and field_name not in ["messages", "max_tokens"]:
                if field_value is not None:
                    other_params[field_name] = field_value
        result = await proxy.process_chat_completion(
            messages=request.messages,
            max_tokens=request.max_tokens,
            **reason_params,
            **other_params
        )
        return result
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8001)