Skip to content

Instantly share code, notes, and snippets.

@pszemraj
Last active November 5, 2025 04:51
Show Gist options
  • Save pszemraj/a97e4720b186d7a4064f9aa40845b861 to your computer and use it in GitHub Desktop.
inference with 3b
"""
example script for inference with LFM2-VL-3B model
https://hf.co/LiquidAI/LFM2-VL-3B
"""
from transformers import AutoModelForImageTextToText, AutoProcessor
from transformers.image_utils import load_image


def main() -> None:
    """Download LFM2-VL-3B, ask one question about a sample image, print the answer.

    Side effects: downloads model weights and an image over the network,
    then prints the generated text to stdout.
    """
    # Load model and processor.  device_map="auto" lets accelerate place the
    # weights; bfloat16 halves memory versus float32.
    model_id = "LiquidAI/LFM2-VL-3B"
    processor = AutoProcessor.from_pretrained(model_id)
    model = AutoModelForImageTextToText.from_pretrained(
        model_id, device_map="auto", dtype="bfloat16"
    )

    # Load image and create conversation in the chat-template message format
    # (one user turn containing an image part and a text part).
    url = "https://www.ilankelman.org/stopsigns/australia.jpg"
    image = load_image(url)
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": "What is in this image?"},
            ],
        },
    ]

    # Generate Answer — sampling hyperparameters for model.generate().
    text_kwargs = dict(
        do_sample=True,
        temperature=0.1,
        min_p=0.15,
        repetition_penalty=1.05,
        max_new_tokens=128,
    )
    # Vision preprocessing knobs forwarded to the processor: token budget per
    # image and whether large images are tiled into sub-images.
    vision_kwargs = dict(
        min_image_tokens=64,
        max_image_tokens=256,
        do_image_splitting=True,
    )
    inputs = processor.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
        tokenize=True,
        **vision_kwargs,
    ).to(model.device)
    outputs = model.generate(**inputs, **text_kwargs)

    # drop the prompt part; keep only newly generated tokens
    prompt_len = inputs["input_ids"].shape[-1]
    gen_ids = outputs[:, prompt_len:]
    print(processor.batch_decode(gen_ids, skip_special_tokens=True)[0])
    # Example output:
    # This image captures a vibrant street scene in a Chinatown area. The focal point is a large red Chinese archway with gold and black accents, adorned with Chinese characters. Flanking the archway are two white stone lion statues, which are traditional guardians in Chinese culture.


if __name__ == "__main__":
    # Guarded so importing this module does not trigger downloads/inference.
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment