Skip to content

Instantly share code, notes, and snippets.

@pszemraj
Last active November 5, 2025 04:51
Show Gist options
  • Save pszemraj/a97e4720b186d7a4064f9aa40845b861 to your computer and use it in GitHub Desktop.
inference with 3b
"""
example script for inference with LFM2-VL-3B model
https://hf.co/LiquidAI/LFM2-VL-3B
"""
from transformers import AutoModelForImageTextToText, AutoProcessor
from transformers.image_utils import load_image


def main() -> None:
    """Download LFM2-VL-3B, ask one question about a sample image, print the answer.

    Side effects: downloads model weights and an image over the network,
    then prints the generated text to stdout.
    """
    # Load model and processor.  device_map="auto" lets accelerate place the
    # weights; bfloat16 halves memory versus float32.
    model_id = "LiquidAI/LFM2-VL-3B"
    processor = AutoProcessor.from_pretrained(model_id)
    model = AutoModelForImageTextToText.from_pretrained(
        model_id, device_map="auto", dtype="bfloat16"
    )

    # Load image and create conversation in the chat-template message format
    # (one user turn containing an image part and a text part).
    url = "https://www.ilankelman.org/stopsigns/australia.jpg"
    image = load_image(url)
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": "What is in this image?"},
            ],
        },
    ]

    # Generate Answer — sampling hyperparameters for model.generate().
    text_kwargs = dict(
        do_sample=True,
        temperature=0.1,
        min_p=0.15,
        repetition_penalty=1.05,
        max_new_tokens=128,
    )
    # Vision preprocessing knobs forwarded to the processor: token budget per
    # image and whether large images are tiled into sub-images.
    vision_kwargs = dict(
        min_image_tokens=64,
        max_image_tokens=256,
        do_image_splitting=True,
    )
    inputs = processor.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
        tokenize=True,
        **vision_kwargs,
    ).to(model.device)
    outputs = model.generate(**inputs, **text_kwargs)

    # drop the prompt part; keep only newly generated tokens
    prompt_len = inputs["input_ids"].shape[-1]
    gen_ids = outputs[:, prompt_len:]
    print(processor.batch_decode(gen_ids, skip_special_tokens=True)[0])
    # Example output:
    # This image captures a vibrant street scene in a Chinatown area. The focal point is a large red Chinese archway with gold and black accents, adorned with Chinese characters. Flanking the archway are two white stone lion statues, which are traditional guardians in Chinese culture.


if __name__ == "__main__":
    # Guarded so importing this module does not trigger downloads/inference.
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment