SomeBottle · April 9, 2025 09:46
diff --git a/web_api.py b/web_api.py
 # Copyright (c) 2023-2024 DeepSeek.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy of
 # this software and associated documentation files (the "Software"), to deal in
 # the Software without restriction, including without limitation the rights to
 # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 # the Software, and to permit persons to whom the Software is furnished to do so,
 # subject to the following conditions:
 #
 # The above copyright notice and this permission notice shall be included in all
 # copies or substantial portions of the Software.
 #
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
 # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
 # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 # -*- coding:utf-8 -*-
 from argparse import ArgumentParser
 import base64
 import io
 import os
 import sys
 import time
 import json
 from typing import List, Dict, Any, Optional, Union, AsyncGenerator
 import uuid

 import torch
 from fastapi import FastAPI, Request, HTTPException
 from fastapi.responses import JSONResponse, StreamingResponse
 from fastapi.middleware.cors import CORSMiddleware
 import uvicorn
 from PIL import Image

 # Import DeepSeek VL2 components
 from deepseek_vl2.serve.app_modules.utils import configure_logger, strip_stop_words
 from deepseek_vl2.serve.inference import (
    convert_conversation_to_prompts,
    deepseek_generate,
    load_model,
 )

 # Shared logger for the whole module, built by the DeepSeek-VL2 serve utilities.
 logger = configure_logger()

 # Model identifiers this server accepts. The list is used to validate the
 # "model" field of incoming requests, as the allowed values of --model_name,
 # and as the payload of the /v1/models endpoint.
 MODELS = [
    "DeepSeek-VL2-tiny",
    "DeepSeek-VL2-small",
    "DeepSeek-VL2",
    "deepseek-ai/deepseek-vl2-tiny",
    "deepseek-ai/deepseek-vl2-small",
    "deepseek-ai/deepseek-vl2",
 ]

 # Placeholder inserted into the prompt once per attached image.
 IMAGE_TOKEN = "<image>"

 # Create FastAPI app
 app = FastAPI(title="DeepSeek-VL2 API")

 # Add CORS middleware
 # NOTE(review): every origin, method and header is allowed and credentials are
 # enabled at the same time. That is very permissive - confirm it is intended
 # before exposing the server beyond a trusted network.
 app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
 )

 # Global model storage
 # Filled on demand by get_model(); entries are looked up by the model name
 # passed to get_model() and hold (tokenizer, vl_gpt, vl_chat_processor) tuples.
 MODEL_CACHE = {}

 def get_model(model_name):
    """Return the ``(tokenizer, vl_gpt, vl_chat_processor)`` triple for a model.

    The checkpoint is read from ``args.local_path`` when that option is set,
    otherwise ``model_name`` itself is handed to ``load_model``. Loaded triples
    are memoised in ``MODEL_CACHE`` under the requested name and also under the
    resolved path, so several names that resolve to the same checkpoint (which
    is always the case when ``--local_path`` is given) share one loaded model
    instead of loading the weights once per name.

    Args:
        model_name: Model identifier taken from the request or the CLI.

    Returns:
        Tuple ``(tokenizer, vl_gpt, vl_chat_processor)`` as produced by
        ``load_model``.
    """
    global MODEL_CACHE, args

    if model_name in MODEL_CACHE:
        return MODEL_CACHE[model_name]

    model_path = args.local_path if args.local_path else model_name

    # A different name may already have loaded this very checkpoint.
    loaded = MODEL_CACHE.get(model_path)
    if loaded is None:
        print(f"Loading DeepSeek-VL2 model from {model_path}...")
        tokenizer, vl_gpt, vl_chat_processor = load_model(model_path)
        loaded = (tokenizer, vl_gpt, vl_chat_processor)
        MODEL_CACHE[model_path] = loaded
        print("Model loaded successfully.")

    MODEL_CACHE[model_name] = loaded
    return loaded

 def process_messages(messages):
    """Extract the prompt text and inline images from a chat ``messages`` list.

    Only the most recent message whose role is ``"user"`` is used; all other
    messages are ignored. Its content may be a plain string, or an
    OpenAI-style list of parts:

    * ``{"type": "text", "text": ...}`` parts are concatenated in order;
    * ``{"type": "image_url", "image_url": {"url": ...}}`` parts are decoded
      when the URL is a base64 ``data:image`` URI. The URL may also be given
      directly as the ``image_url`` string. Other URLs are not fetched; they
      are skipped with a log entry.

    Malformed entries (non-dict messages or parts, missing keys, non-string
    text) are ignored instead of raising.

    Args:
        messages: Iterable of message dicts from the request body, or ``None``.

    Returns:
        ``(text, images)`` where ``text`` is the collected prompt string and
        ``images`` is a list of RGB PIL images; ``("", [])`` when no usable
        user message exists.
    """
    text = ""
    images = []

    user_messages = [
        msg for msg in (messages or [])
        if isinstance(msg, dict) and msg.get("role") == "user"
    ]
    if not user_messages:
        return text, images

    content = user_messages[-1].get("content")

    if isinstance(content, str):
        return content, images
    if not isinstance(content, list):
        return text, images

    text_parts = []
    for part in content:
        if not isinstance(part, dict):
            continue
        part_type = part.get("type")
        if part_type == "text":
            piece = part.get("text")
            if isinstance(piece, str):
                text_parts.append(piece)
        elif part_type == "image_url":
            image_data = part.get("image_url")
            # Accept both {"url": "..."} and a bare URL string.
            url = image_data.get("url") if isinstance(image_data, dict) else image_data
            if not isinstance(url, str) or not url.startswith("data:image"):
                logger.warning("Skipping image part: only base64 data:image URLs are supported")
                continue
            try:
                # Everything after the first comma is the base64 payload.
                base64_data = url.split(",", 1)[1]
                img_bytes = base64.b64decode(base64_data)
                img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
                images.append(img)
            except Exception as e:
                # Best effort: one broken image must not fail the whole request.
                logger.error(f"Error processing image: {e}")

    return "".join(text_parts), images

 @app.post("/v1/chat/completions")
 async def chat_completion(request: Request):
    """OpenAI-compatible chat completion endpoint.

    Reads a JSON object with ``model``, ``messages`` and the optional fields
    ``temperature`` (default 0.1), ``top_p`` (0.9), ``max_tokens`` (2048),
    ``repetition_penalty`` (1.1) and ``stream`` (False). Fields sent as
    ``null`` fall back to their defaults. The prompt is built from the text
    and images returned by ``process_messages``; one ``IMAGE_TOKEN`` line per
    image is placed in front of the text.

    Args:
        request: Incoming FastAPI request carrying the JSON body.

    Returns:
        A ``chat.completion`` dict, or, when ``stream`` is truthy, a
        ``StreamingResponse`` emitting ``chat.completion.chunk`` server-sent
        events followed by ``data: [DONE]``.

    Raises:
        HTTPException: 400 for a malformed body, an invalid sampling value, an
            unknown model or missing text; 500 for any other failure.
    """
    try:
        try:
            body = await request.json()
        except ValueError as e:
            raise HTTPException(status_code=400, detail=f"Invalid JSON body: {e}")
        if not isinstance(body, dict):
            raise HTTPException(status_code=400, detail="Request body must be a JSON object")

        def read_param(name, cast, default):
            """Return ``body[name]`` converted by ``cast``; missing/null gives ``default``."""
            value = body.get(name)
            if value is None:
                return default
            try:
                return cast(value)
            except (TypeError, ValueError):
                raise HTTPException(
                    status_code=400,
                    detail=f"Invalid value for '{name}': {value!r}",
                )

        # Extract parameters
        model = body.get("model") or args.model_name
        messages = body.get("messages") or []
        temperature = read_param("temperature", float, 0.1)
        top_p = read_param("top_p", float, 0.9)
        max_tokens = read_param("max_tokens", int, 2048)
        stream = bool(body.get("stream", False))
        repetition_penalty = read_param("repetition_penalty", float, 1.1)

        if model not in MODELS:
            raise HTTPException(
                status_code=400,
                detail=f"Model {model} not found. Available models: {', '.join(MODELS)}"
            )

        # Validate the prompt before (possibly) loading a model for it.
        text, images = process_messages(messages)
        if not text:
            raise HTTPException(status_code=400, detail="No text content provided")

        tokenizer, vl_gpt, vl_chat_processor = get_model(model)

        # Format the prompt: one image placeholder line per image, then the text.
        if images:
            image_tokens = "\n".join([IMAGE_TOKEN] * len(images))
            text = image_tokens + "\n" + text
            text_with_images = (text, images)
        else:
            text_with_images = text

        # Single-turn conversation: the user prompt plus an empty assistant
        # slot for the model to fill.
        conversation = vl_chat_processor.new_chat_template()
        conversation.append_message(conversation.roles[0], text_with_images)
        conversation.append_message(conversation.roles[1], "")

        # Convert to format expected by DeepSeek generator
        all_conv, _ = convert_conversation_to_prompts(conversation)
        stop_words = conversation.stop_str

        response_id = f"chatcmpl-{uuid.uuid4().hex}"
        created_time = int(time.time())

        # Shared by the streaming and the non-streaming path.
        generate_kwargs = dict(
            conversations=all_conv,
            vl_gpt=vl_gpt,
            vl_chat_processor=vl_chat_processor,
            tokenizer=tokenizer,
            stop_words=stop_words,
            max_length=max_tokens,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            top_p=top_p,
            chunk_size=args.chunk_size,
        )

        if stream:
            def make_event(delta, finish_reason=None):
                """Serialise one ``chat.completion.chunk`` as a server-sent event."""
                payload = {
                    'id': response_id,
                    'object': 'chat.completion.chunk',
                    'created': created_time,
                    'model': model,
                    'choices': [{'index': 0, 'delta': delta, 'finish_reason': finish_reason}],
                }
                return f"data: {json.dumps(payload)}\n\n"

            # NOTE: deepseek_generate is iterated synchronously inside this
            # async generator, so the event loop is busy while tokens are
            # being produced.
            async def generate_stream():
                try:
                    # Start event - role delta only
                    yield make_event({'role': 'assistant'})
                    with torch.no_grad():
                        for chunk in deepseek_generate(**generate_kwargs):
                            yield make_event({'content': chunk})
                    # End event - empty delta with finish reason
                    yield make_event({}, finish_reason='stop')
                    yield "data: [DONE]\n\n"
                finally:
                    # Also runs when generation fails or the stream is closed early.
                    torch.cuda.empty_cache()

            return StreamingResponse(generate_stream(), media_type="text/event-stream")

        # Non-streaming response
        try:
            with torch.no_grad():
                full_response = "".join(deepseek_generate(**generate_kwargs))
        finally:
            torch.cuda.empty_cache()

        response = strip_stop_words(full_response, stop_words)
        conversation.update_last_message(response)

        # Rough estimate only (about four characters per token); the tokenizer
        # is not consulted.
        prompt_tokens = len(text) // 4
        completion_tokens = len(response) // 4

        # Return OpenAI-style response
        return {
            "id": response_id,
            "object": "chat.completion",
            "created": created_time,
            "model": model,
            "choices": [
                {
                    "index": 0,
                    "message": {
                        "role": "assistant",
                        "content": response
                    },
                    "finish_reason": "stop"
                }
            ],
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens
            }
        }

    except HTTPException:
        # Deliberate client errors must keep their own status code instead of
        # being rewrapped as a 500 below.
        raise
    except Exception as e:
        logger.error(f"Error in chat_completion: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e)) from e

 @app.get("/v1/models")
 async def list_models():
    """Return every entry of ``MODELS`` as an OpenAI-style model list."""
    entries = []
    for name in MODELS:
        entries.append({"id": name, "object": "model"})
    return {"object": "list", "data": entries}

 @app.get("/health")
 async def health_check():
    """Liveness probe: always answers with a fixed ``ok`` status."""
    payload = {"status": "ok"}
    return payload

 if __name__ == "__main__":
    # Command-line entry point: parse options, optionally preload the default
    # model, then serve the FastAPI app with uvicorn.
    parser = ArgumentParser()
    parser.add_argument("--model_name", type=str, default="deepseek-ai/deepseek-vl2-tiny", required=False, choices=MODELS, help="model name")
    parser.add_argument("--local_path", type=str, default="", help="huggingface ckpt, optional")
    parser.add_argument("--host", type=str, default="0.0.0.0", help="host address")
    parser.add_argument("--port", type=int, default=37913, help="port number")
    parser.add_argument("--chunk_size", type=int, default=-1,
                        help="chunk size for the model for prefilling")
    # The preload check below used to test for a `lazy_load` attribute that no
    # option ever defined, so preloading could not be switched off. Register
    # the flag; its default (False) keeps the previous behaviour.
    parser.add_argument("--lazy_load", action="store_true",
                        help="load the model on the first request instead of at startup")
    args = parser.parse_args()

    # Preload the default model unless lazy loading was requested.
    if not args.lazy_load:
        get_model(args.model_name)

    # Start the server
    uvicorn.run(app, host=args.host, port=args.port)
	# Copyright (c) 2023-2024 DeepSeek.
	#
	# Permission is hereby granted, free of charge, to any person obtaining a copy of
	# this software and associated documentation files (the "Software"), to deal in
	# the Software without restriction, including without limitation the rights to
	# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
	# the Software, and to permit persons to whom the Software is furnished to do so,
	# subject to the following conditions:
	#
	# The above copyright notice and this permission notice shall be included in all
	# copies or substantial portions of the Software.
	#
	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
	# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
	# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
	# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
	# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

	# -*- coding:utf-8 -*-
	from argparse import ArgumentParser
	import base64
	import io
	import os
	import sys
	import time
	import json
	from typing import List, Dict, Any, Optional, Union, AsyncGenerator
	import uuid

	import torch
	from fastapi import FastAPI, Request, HTTPException
	from fastapi.responses import JSONResponse, StreamingResponse
	from fastapi.middleware.cors import CORSMiddleware
	import uvicorn
	from PIL import Image

	# Import DeepSeek VL2 components
	from deepseek_vl2.serve.app_modules.utils import configure_logger, strip_stop_words
	from deepseek_vl2.serve.inference import (
	convert_conversation_to_prompts,
	deepseek_generate,
	load_model,
	)

	# Shared logger for the whole module, built by the DeepSeek-VL2 serve utilities.
	logger = configure_logger()

	# Model identifiers this server accepts. The list is used to validate the
	# "model" field of incoming requests, as the allowed values of --model_name,
	# and as the payload of the /v1/models endpoint.
	MODELS = [
	"DeepSeek-VL2-tiny",
	"DeepSeek-VL2-small",
	"DeepSeek-VL2",
	"deepseek-ai/deepseek-vl2-tiny",
	"deepseek-ai/deepseek-vl2-small",
	"deepseek-ai/deepseek-vl2",
	]

	# Placeholder inserted into the prompt once per attached image.
	IMAGE_TOKEN = "<image>"

	# Create FastAPI app
	app = FastAPI(title="DeepSeek-VL2 API")

	# Add CORS middleware
	# NOTE(review): every origin, method and header is allowed and credentials are
	# enabled at the same time. That is very permissive - confirm it is intended
	# before exposing the server beyond a trusted network.
	app.add_middleware(
	CORSMiddleware,
	allow_origins=["*"],
	allow_credentials=True,
	allow_methods=["*"],
	allow_headers=["*"],
	)

	# Global model storage
	# Filled on demand by get_model(); entries are looked up by the model name
	# passed to get_model() and hold (tokenizer, vl_gpt, vl_chat_processor) tuples.
	MODEL_CACHE = {}

	def get_model(model_name):
	    """Return the ``(tokenizer, vl_gpt, vl_chat_processor)`` triple for a model.

	    The checkpoint is read from ``args.local_path`` when that option is set,
	    otherwise ``model_name`` itself is handed to ``load_model``. Loaded triples
	    are memoised in ``MODEL_CACHE`` under the requested name and also under the
	    resolved path, so several names that resolve to the same checkpoint (which
	    is always the case when ``--local_path`` is given) share one loaded model
	    instead of loading the weights once per name.

	    Args:
	        model_name: Model identifier taken from the request or the CLI.

	    Returns:
	        Tuple ``(tokenizer, vl_gpt, vl_chat_processor)`` as produced by
	        ``load_model``.
	    """
	    global MODEL_CACHE, args

	    if model_name in MODEL_CACHE:
	        return MODEL_CACHE[model_name]

	    model_path = args.local_path if args.local_path else model_name

	    # A different name may already have loaded this very checkpoint.
	    loaded = MODEL_CACHE.get(model_path)
	    if loaded is None:
	        print(f"Loading DeepSeek-VL2 model from {model_path}...")
	        tokenizer, vl_gpt, vl_chat_processor = load_model(model_path)
	        loaded = (tokenizer, vl_gpt, vl_chat_processor)
	        MODEL_CACHE[model_path] = loaded
	        print("Model loaded successfully.")

	    MODEL_CACHE[model_name] = loaded
	    return loaded

	def process_messages(messages):
	    """Extract the prompt text and inline images from a chat ``messages`` list.

	    Only the most recent message whose role is ``"user"`` is used; all other
	    messages are ignored. Its content may be a plain string, or an
	    OpenAI-style list of parts:

	    * ``{"type": "text", "text": ...}`` parts are concatenated in order;
	    * ``{"type": "image_url", "image_url": {"url": ...}}`` parts are decoded
	      when the URL is a base64 ``data:image`` URI. The URL may also be given
	      directly as the ``image_url`` string. Other URLs are not fetched; they
	      are skipped with a log entry.

	    Malformed entries (non-dict messages or parts, missing keys, non-string
	    text) are ignored instead of raising.

	    Args:
	        messages: Iterable of message dicts from the request body, or ``None``.

	    Returns:
	        ``(text, images)`` where ``text`` is the collected prompt string and
	        ``images`` is a list of RGB PIL images; ``("", [])`` when no usable
	        user message exists.
	    """
	    text = ""
	    images = []

	    user_messages = [
	        msg for msg in (messages or [])
	        if isinstance(msg, dict) and msg.get("role") == "user"
	    ]
	    if not user_messages:
	        return text, images

	    content = user_messages[-1].get("content")

	    if isinstance(content, str):
	        return content, images
	    if not isinstance(content, list):
	        return text, images

	    text_parts = []
	    for part in content:
	        if not isinstance(part, dict):
	            continue
	        part_type = part.get("type")
	        if part_type == "text":
	            piece = part.get("text")
	            if isinstance(piece, str):
	                text_parts.append(piece)
	        elif part_type == "image_url":
	            image_data = part.get("image_url")
	            # Accept both {"url": "..."} and a bare URL string.
	            url = image_data.get("url") if isinstance(image_data, dict) else image_data
	            if not isinstance(url, str) or not url.startswith("data:image"):
	                logger.warning("Skipping image part: only base64 data:image URLs are supported")
	                continue
	            try:
	                # Everything after the first comma is the base64 payload.
	                base64_data = url.split(",", 1)[1]
	                img_bytes = base64.b64decode(base64_data)
	                img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
	                images.append(img)
	            except Exception as e:
	                # Best effort: one broken image must not fail the whole request.
	                logger.error(f"Error processing image: {e}")

	    return "".join(text_parts), images

	@app.post("/v1/chat/completions")
	async def chat_completion(request: Request):
	    """OpenAI-compatible chat completion endpoint.

	    Reads a JSON object with ``model``, ``messages`` and the optional fields
	    ``temperature`` (default 0.1), ``top_p`` (0.9), ``max_tokens`` (2048),
	    ``repetition_penalty`` (1.1) and ``stream`` (False). Fields sent as
	    ``null`` fall back to their defaults. The prompt is built from the text
	    and images returned by ``process_messages``; one ``IMAGE_TOKEN`` line per
	    image is placed in front of the text.

	    Args:
	        request: Incoming FastAPI request carrying the JSON body.

	    Returns:
	        A ``chat.completion`` dict, or, when ``stream`` is truthy, a
	        ``StreamingResponse`` emitting ``chat.completion.chunk`` server-sent
	        events followed by ``data: [DONE]``.

	    Raises:
	        HTTPException: 400 for a malformed body, an invalid sampling value, an
	            unknown model or missing text; 500 for any other failure.
	    """
	    try:
	        try:
	            body = await request.json()
	        except ValueError as e:
	            raise HTTPException(status_code=400, detail=f"Invalid JSON body: {e}")
	        if not isinstance(body, dict):
	            raise HTTPException(status_code=400, detail="Request body must be a JSON object")

	        def read_param(name, cast, default):
	            """Return ``body[name]`` converted by ``cast``; missing/null gives ``default``."""
	            value = body.get(name)
	            if value is None:
	                return default
	            try:
	                return cast(value)
	            except (TypeError, ValueError):
	                raise HTTPException(
	                    status_code=400,
	                    detail=f"Invalid value for '{name}': {value!r}",
	                )

	        # Extract parameters
	        model = body.get("model") or args.model_name
	        messages = body.get("messages") or []
	        temperature = read_param("temperature", float, 0.1)
	        top_p = read_param("top_p", float, 0.9)
	        max_tokens = read_param("max_tokens", int, 2048)
	        stream = bool(body.get("stream", False))
	        repetition_penalty = read_param("repetition_penalty", float, 1.1)

	        if model not in MODELS:
	            raise HTTPException(
	                status_code=400,
	                detail=f"Model {model} not found. Available models: {', '.join(MODELS)}"
	            )

	        # Validate the prompt before (possibly) loading a model for it.
	        text, images = process_messages(messages)
	        if not text:
	            raise HTTPException(status_code=400, detail="No text content provided")

	        tokenizer, vl_gpt, vl_chat_processor = get_model(model)

	        # Format the prompt: one image placeholder line per image, then the text.
	        if images:
	            image_tokens = "\n".join([IMAGE_TOKEN] * len(images))
	            text = image_tokens + "\n" + text
	            text_with_images = (text, images)
	        else:
	            text_with_images = text

	        # Single-turn conversation: the user prompt plus an empty assistant
	        # slot for the model to fill.
	        conversation = vl_chat_processor.new_chat_template()
	        conversation.append_message(conversation.roles[0], text_with_images)
	        conversation.append_message(conversation.roles[1], "")

	        # Convert to format expected by DeepSeek generator
	        all_conv, _ = convert_conversation_to_prompts(conversation)
	        stop_words = conversation.stop_str

	        response_id = f"chatcmpl-{uuid.uuid4().hex}"
	        created_time = int(time.time())

	        # Shared by the streaming and the non-streaming path.
	        generate_kwargs = dict(
	            conversations=all_conv,
	            vl_gpt=vl_gpt,
	            vl_chat_processor=vl_chat_processor,
	            tokenizer=tokenizer,
	            stop_words=stop_words,
	            max_length=max_tokens,
	            temperature=temperature,
	            repetition_penalty=repetition_penalty,
	            top_p=top_p,
	            chunk_size=args.chunk_size,
	        )

	        if stream:
	            def make_event(delta, finish_reason=None):
	                """Serialise one ``chat.completion.chunk`` as a server-sent event."""
	                payload = {
	                    'id': response_id,
	                    'object': 'chat.completion.chunk',
	                    'created': created_time,
	                    'model': model,
	                    'choices': [{'index': 0, 'delta': delta, 'finish_reason': finish_reason}],
	                }
	                return f"data: {json.dumps(payload)}\n\n"

	            # NOTE: deepseek_generate is iterated synchronously inside this
	            # async generator, so the event loop is busy while tokens are
	            # being produced.
	            async def generate_stream():
	                try:
	                    # Start event - role delta only
	                    yield make_event({'role': 'assistant'})
	                    with torch.no_grad():
	                        for chunk in deepseek_generate(**generate_kwargs):
	                            yield make_event({'content': chunk})
	                    # End event - empty delta with finish reason
	                    yield make_event({}, finish_reason='stop')
	                    yield "data: [DONE]\n\n"
	                finally:
	                    # Also runs when generation fails or the stream is closed early.
	                    torch.cuda.empty_cache()

	            return StreamingResponse(generate_stream(), media_type="text/event-stream")

	        # Non-streaming response
	        try:
	            with torch.no_grad():
	                full_response = "".join(deepseek_generate(**generate_kwargs))
	        finally:
	            torch.cuda.empty_cache()

	        response = strip_stop_words(full_response, stop_words)
	        conversation.update_last_message(response)

	        # Rough estimate only (about four characters per token); the tokenizer
	        # is not consulted.
	        prompt_tokens = len(text) // 4
	        completion_tokens = len(response) // 4

	        # Return OpenAI-style response
	        return {
	            "id": response_id,
	            "object": "chat.completion",
	            "created": created_time,
	            "model": model,
	            "choices": [
	                {
	                    "index": 0,
	                    "message": {
	                        "role": "assistant",
	                        "content": response
	                    },
	                    "finish_reason": "stop"
	                }
	            ],
	            "usage": {
	                "prompt_tokens": prompt_tokens,
	                "completion_tokens": completion_tokens,
	                "total_tokens": prompt_tokens + completion_tokens
	            }
	        }

	    except HTTPException:
	        # Deliberate client errors must keep their own status code instead of
	        # being rewrapped as a 500 below.
	        raise
	    except Exception as e:
	        logger.error(f"Error in chat_completion: {str(e)}")
	        raise HTTPException(status_code=500, detail=str(e)) from e

	@app.get("/v1/models")
	async def list_models():
	    """Return every entry of ``MODELS`` as an OpenAI-style model list."""
	    entries = []
	    for name in MODELS:
	        entries.append({"id": name, "object": "model"})
	    return {"object": "list", "data": entries}

	@app.get("/health")
	async def health_check():
	    """Liveness probe: always answers with a fixed ``ok`` status."""
	    payload = {"status": "ok"}
	    return payload

	if __name__ == "__main__":
	    # Command-line entry point: parse options, optionally preload the default
	    # model, then serve the FastAPI app with uvicorn.
	    parser = ArgumentParser()
	    parser.add_argument("--model_name", type=str, default="deepseek-ai/deepseek-vl2-tiny", required=False, choices=MODELS, help="model name")
	    parser.add_argument("--local_path", type=str, default="", help="huggingface ckpt, optional")
	    parser.add_argument("--host", type=str, default="0.0.0.0", help="host address")
	    parser.add_argument("--port", type=int, default=37913, help="port number")
	    parser.add_argument("--chunk_size", type=int, default=-1,
	                        help="chunk size for the model for prefilling")
	    # The preload check below used to test for a `lazy_load` attribute that no
	    # option ever defined, so preloading could not be switched off. Register
	    # the flag; its default (False) keeps the previous behaviour.
	    parser.add_argument("--lazy_load", action="store_true",
	                        help="load the model on the first request instead of at startup")
	    args = parser.parse_args()

	    # Preload the default model unless lazy loading was requested.
	    if not args.lazy_load:
	        get_model(args.model_name)

	    # Start the server
	    uvicorn.run(app, host=args.host, port=args.port)
No results found