Deploy file for unsloth Qwen3-VL 8B on Modal
import modal
import io

# Configuration for using the safetensors model (Transformers)
MODEL_NAME = "unsloth/Qwen3-VL-8B-Instruct"

# Volume for caching the model
VOLUME_NAME = "qwen3-models-safetensors"
MODEL_CACHE_DIR = "/vol/models"

# Define an image with the required libraries installed
image = (
    modal.Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10")
    .apt_install("git")
    .pip_install("packaging", "ninja", "wheel", "setuptools")  # build dependencies
    .pip_install(
        "torch==2.4.0",
        "torchvision==0.19.0",
        "accelerate",
        "pillow",
        "requests",
        "git+https://github.com/huggingface/transformers",
    )
    # Pre-built Flash Attention 2 wheel for PyTorch 2.4.0 / Python 3.10.
    # Note the wheel is a cu123 build; CUDA 12.x wheels generally run fine on the
    # CUDA 12.1 base image. This combination is very stable and matches the setup
    # recommended by Unsloth and others.
    .pip_install(
        "https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"
    )
)

app = modal.App("maisoku-analyzer-safetensors")
volume = modal.Volume.from_name(VOLUME_NAME, create_if_missing=True)


@app.cls(
    image=image,
    gpu="L4",  # L4 instead of A10G: same 24 GB VRAM class, often better availability
    volumes={"/vol": volume},
    timeout=600,  # per-call timeout
    # Keep idle containers warm for 5 minutes. container_idle_timeout is the
    # deprecated name for this setting, so only scaledown_window is passed.
    scaledown_window=300,
)
class Model:
    @modal.enter()
    def load_model(self):
        print("Container started. Initializing model loading...")
        import torch
        from transformers import Qwen3VLForConditionalGeneration, AutoProcessor

        print(f"Loading model: {MODEL_NAME}")
        # Load the model and processor.
        # Point the cache at the Volume so later cold starts load from disk.
        self.processor = AutoProcessor.from_pretrained(
            MODEL_NAME,
            cache_dir=MODEL_CACHE_DIR,
            trust_remote_code=True,
        )
        self.model = Qwen3VLForConditionalGeneration.from_pretrained(
            MODEL_NAME,
            cache_dir=MODEL_CACHE_DIR,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            attn_implementation="flash_attention_2",  # enable Flash Attention 2
            trust_remote_code=True,
        )
        print("Model loaded successfully")

    @modal.method()
    def analyze_image(self, image_bytes: bytes, prompt_text: str):
        from PIL import Image
        import torch
        import time

        start_time = time.time()
        print("Start processing image...")
        # Load the image from raw bytes
        try:
            image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        except Exception as e:
            print(f"Error loading image: {e}")
            raise
        print(f"Image loaded: {image.size}. Time: {time.time() - start_time:.2f}s")

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": prompt_text},
                ],
            }
        ]
        # Build the inference inputs.
        # Recent Transformers releases can process images directly in apply_chat_template.
        inputs = self.processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        )
        inputs = inputs.to(self.model.device)
        print(f"Inputs prepared. Time: {time.time() - start_time:.2f}s")

        # Run generation
        generated_ids = self.model.generate(
            **inputs,
            max_new_tokens=2048,
            temperature=0.2,
            do_sample=True,
        )
        print(f"Generation completed. Time: {time.time() - start_time:.2f}s")

        # Strip the prompt tokens from each sequence, then decode
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = self.processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        return output_text[0]


# Local test command:
# modal run land_purchase/maisoku_converter/backend_ai_modal/modal_deploy.py
@app.local_entrypoint()
def main():
    import requests

    # Sample image (Qwen-VL demo image)
    image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
    print(f"Downloading sample image from {image_url}...")
    try:
        response = requests.get(image_url)
        response.raise_for_status()
        image_bytes = response.content
    except Exception as e:
        print(f"Failed to download sample image: {e}")
        return

    # "Please describe this image in detail, in Japanese."
    prompt = "この画像を日本語で詳しく描写してください。"
    print("Sending request to Modal app...")
    try:
        model = Model()
        # Remote execution
        result = model.analyze_image.remote(image_bytes, prompt)
        print("\nResult:")
        print(result)
    except Exception as e:
        print(f"Execution failed: {e}")