Deploy file for unsloth Qwen3-VL 8B on Modal
import modal
import io

# Configuration for using the safetensors model (Transformers)
MODEL_NAME = "unsloth/Qwen3-VL-8B-Instruct"

# Volume for caching model weights
VOLUME_NAME = "qwen3-models-safetensors"
MODEL_CACHE_DIR = "/vol/models"
# Define an image with the required libraries installed
image = (
    modal.Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10")
    .apt_install("git")
    .pip_install("packaging", "ninja", "wheel", "setuptools")  # Build dependencies
    .pip_install(
        "torch==2.4.0",
        "torchvision==0.19.0",
        "accelerate",
        "pillow",
        "requests",
        "git+https://github.com/huggingface/transformers",
    )
    # Pre-built Flash Attention 2 wheel for PyTorch 2.4.0 / Python 3.10
    # (a cu123 build, binary-compatible with the CUDA 12.1 image above).
    # This combination is very stable and matches the setup recommended by Unsloth and others.
    .pip_install(
        "https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"
    )
)
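# Optionally, the Hugging Face cache can be pointed at the Volume globally via
# an environment variable instead of passing cache_dir to each from_pretrained()
# call. A minimal sketch (HF_HOME is the standard Hugging Face cache variable;
# this line is an illustration, not part of the original file):
#
#   image = image.env({"HF_HOME": MODEL_CACHE_DIR})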
app = modal.App("maisoku-analyzer-safetensors")
volume = modal.Volume.from_name(VOLUME_NAME, create_if_missing=True)
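# from_name(..., create_if_missing=True) creates the Volume on the first deploy
# and reuses it afterwards, so cached weights survive across container restarts.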
@app.cls(
    image=image,
    gpu="L4",  # L4 instead of A10G (same 24 GB VRAM class, often better availability)
    volumes={"/vol": volume},
    timeout=600,  # Per-call processing timeout (seconds)
    scaledown_window=300,  # Keep idle containers alive for 300 s (newer name for container_idle_timeout)
)
class Model:
    @modal.enter()
    def load_model(self):
        print("Container started. Initializing model loading...")
        import torch
        from transformers import Qwen3VLForConditionalGeneration, AutoProcessor

        print(f"Loading model: {MODEL_NAME}")
        # Load the model and processor.
        # Using the Volume as the cache directory speeds up loads on subsequent runs.
        self.processor = AutoProcessor.from_pretrained(
            MODEL_NAME,
            cache_dir=MODEL_CACHE_DIR,
            trust_remote_code=True,
        )
        self.model = Qwen3VLForConditionalGeneration.from_pretrained(
            MODEL_NAME,
            cache_dir=MODEL_CACHE_DIR,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            attn_implementation="flash_attention_2",  # Enable Flash Attention
            trust_remote_code=True,
        )
        print("Model loaded successfully")
    @modal.method()
    def analyze_image(self, image_bytes: bytes, prompt_text: str):
        from PIL import Image
        import time

        start_time = time.time()
        print("Start processing image...")
        # Load the image from raw bytes
        try:
            image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        except Exception as e:
            print(f"Error loading image: {e}")
            raise
        print(f"Image loaded: {image.size}. Time: {time.time() - start_time:.2f}s")

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": prompt_text},
                ],
            }
        ]
        # Build the inference inputs.
        # Recent Transformers versions can process images directly in apply_chat_template.
        inputs = self.processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        )
        inputs = inputs.to(self.model.device)
        print(f"Inputs prepared. Time: {time.time() - start_time:.2f}s")

        # Run generation
        generated_ids = self.model.generate(
            **inputs,
            max_new_tokens=2048,
            temperature=0.2,
            do_sample=True,
        )
        print(f"Generation completed. Time: {time.time() - start_time:.2f}s")

        # Trim the prompt tokens from the output, then decode
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = self.processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        return output_text[0]
# Local test command:
# modal run land_purchase/maisoku_converter/backend_ai_modal/modal_deploy.py
@app.local_entrypoint()
def main():
    import requests

    # Test image (the Qwen-VL demo image)
    image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
    print(f"Downloading sample image from {image_url}...")
    try:
        response = requests.get(image_url)
        response.raise_for_status()
        image_bytes = response.content
    except Exception as e:
        print(f"Failed to download sample image: {e}")
        return

    # "Please describe this image in detail, in Japanese."
    prompt = "この画像を日本語で詳しく描写してください。"
    print("Sending request to Modal app...")
    try:
        model = Model()
        # Remote execution
        result = model.analyze_image.remote(image_bytes, prompt)
        print("\nResult:")
        print(result)
    except Exception as e:
        print(f"Execution failed: {e}")