Deploy file for unsloth Qwen3-VL 8B on Modal
import modal
import io

# Configuration for using the safetensors model (Transformers)
MODEL_NAME = "unsloth/Qwen3-VL-8B-Instruct"

# Volume for caching the model
VOLUME_NAME = "qwen3-models-safetensors"
MODEL_CACHE_DIR = "/vol/models"

# Define an image with the required libraries installed
image = (
    modal.Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10")
    .apt_install("git")
    .pip_install("packaging", "ninja", "wheel", "setuptools")  # build dependencies
    .pip_install(
        "torch==2.4.0",
        "torchvision==0.19.0",
        "accelerate",
        "pillow",
        "requests",
        "git+https://github.com/huggingface/transformers",
    )
    # Pre-built Flash Attention 2 wheel for PyTorch 2.4.0 / Python 3.10.
    # Note the wheel is a cu123 build; CUDA 12.x wheels generally run fine on the
    # CUDA 12.1 base image. This combination is very stable and matches the setup
    # recommended by Unsloth and others.
    .pip_install(
        "https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"
    )
)

app = modal.App("maisoku-analyzer-safetensors")
volume = modal.Volume.from_name(VOLUME_NAME, create_if_missing=True)


@app.cls(
    image=image,
    gpu="L4",  # L4 instead of A10G: same 24 GB VRAM class, often better availability
    volumes={"/vol": volume},
    timeout=600,  # per-call timeout
    # Keep idle containers warm for 5 minutes. container_idle_timeout is the
    # deprecated name for this setting, so only scaledown_window is passed.
    scaledown_window=300,
)
class Model:
    @modal.enter()
    def load_model(self):
        print("Container started. Initializing model loading...")
        import torch
        from transformers import Qwen3VLForConditionalGeneration, AutoProcessor

        print(f"Loading model: {MODEL_NAME}")
        # Load the model and processor.
        # Point the cache at the Volume so later cold starts load from disk.
        self.processor = AutoProcessor.from_pretrained(
            MODEL_NAME,
            cache_dir=MODEL_CACHE_DIR,
            trust_remote_code=True,
        )
        self.model = Qwen3VLForConditionalGeneration.from_pretrained(
            MODEL_NAME,
            cache_dir=MODEL_CACHE_DIR,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            attn_implementation="flash_attention_2",  # enable Flash Attention 2
            trust_remote_code=True,
        )
        print("Model loaded successfully")

    @modal.method()
    def analyze_image(self, image_bytes: bytes, prompt_text: str):
        from PIL import Image
        import torch
        import time

        start_time = time.time()
        print("Start processing image...")
        # Load the image from raw bytes
        try:
            image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        except Exception as e:
            print(f"Error loading image: {e}")
            raise
        print(f"Image loaded: {image.size}. Time: {time.time() - start_time:.2f}s")

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": prompt_text},
                ],
            }
        ]
        # Build the inference inputs.
        # Recent Transformers releases can process images directly in apply_chat_template.
        inputs = self.processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        )
        inputs = inputs.to(self.model.device)
        print(f"Inputs prepared. Time: {time.time() - start_time:.2f}s")

        # Run generation
        generated_ids = self.model.generate(
            **inputs,
            max_new_tokens=2048,
            temperature=0.2,
            do_sample=True,
        )
        print(f"Generation completed. Time: {time.time() - start_time:.2f}s")

        # Strip the prompt tokens from each sequence, then decode
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = self.processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        return output_text[0]


# Local test command:
# modal run land_purchase/maisoku_converter/backend_ai_modal/modal_deploy.py
@app.local_entrypoint()
def main():
    import requests

    # Sample image (Qwen-VL demo image)
    image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
    print(f"Downloading sample image from {image_url}...")
    try:
        response = requests.get(image_url)
        response.raise_for_status()
        image_bytes = response.content
    except Exception as e:
        print(f"Failed to download sample image: {e}")
        return

    # "Please describe this image in detail, in Japanese."
    prompt = "この画像を日本語で詳しく描写してください。"
    print("Sending request to Modal app...")
    try:
        model = Model()
        # Remote execution
        result = model.analyze_image.remote(image_bytes, prompt)
        print("\nResult:")
        print(result)
    except Exception as e:
        print(f"Execution failed: {e}")