@jnulzl · Created February 27, 2025 07:01
Qwen2.5-VL GPTQ Quantization Tutorial
demo_qwen_vl.py:

import os
import sys

from transformers import AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info


def main(model_dir, messages):
    if not os.path.exists(model_dir):
        raise Exception("%s directory doesn't exist!" % (model_dir))
    if "Qwen2-VL" in model_dir:
        from transformers import Qwen2VLForConditionalGeneration as QwenVLForConditionalGeneration
    elif "Qwen2.5-VL" in model_dir:
        from transformers import Qwen2_5_VLForConditionalGeneration as QwenVLForConditionalGeneration
    else:
        raise Exception("model_dir must contain 'Qwen2-VL' or 'Qwen2.5-VL'")
    print(QwenVLForConditionalGeneration)

    # default: load the model on the available device(s)
    model = QwenVLForConditionalGeneration.from_pretrained(
        model_dir,
        attn_implementation="flash_attention_2",
        device_map="auto"
    )

    # We recommend enabling flash_attention_2 for better acceleration and memory saving,
    # especially in multi-image and video scenarios (uncommenting this variant requires `import torch`).
    # model = QwenVLForConditionalGeneration.from_pretrained(
    #     model_dir,
    #     torch_dtype=torch.bfloat16,
    #     attn_implementation="flash_attention_2",
    #     device_map="auto",
    # )

    # default processor
    processor = AutoProcessor.from_pretrained(model_dir)

    # The default range for the number of visual tokens per image is 4-16384. You can set
    # min_pixels and max_pixels according to your needs, e.g. a token count range of
    # 256-1280, to balance speed and memory usage.
    # min_pixels = 256 * 28 * 28
    # max_pixels = 1280 * 28 * 28
    # processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8", min_pixels=min_pixels, max_pixels=max_pixels)

    # Preparation for inference
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to("cuda")

    # Inference: generate the output and strip the prompt tokens from each sequence
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    # print(output_text)
    return output_text


if __name__ == "__main__":
    if 2 != len(sys.argv):
        print("Usage:\n\t python %s model_dir" % (sys.argv[0]))
        sys.exit(-1)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": "https://raw.githubusercontent.com/ymcui/Chinese-LLaMA-Alpaca-3/refs/heads/main/pics/banner.png",
                },
                {"type": "text", "text": "请描述一下这张图."},
            ],
        }
    ]
    model_dir = sys.argv[1]
    output_text = main(model_dir, messages)
    print(output_text)
quant_qwenvl_gptqmodel.py:

import os
import sys
import logging

import torch
from gptqmodel import GPTQModel, QuantizeConfig, get_best_device
from transformers import AutoTokenizer

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

# Set up logging
logging.basicConfig(
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
)


class QwenVLQuant:
    def __init__(self, model_path, bits):
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
        self.quantize_config = QuantizeConfig(
            bits=bits,       # quantize the model to `bits` bits (e.g. 4 or 8)
            group_size=128,  # it is recommended to set the value to 128
        )
        # Load the un-quantized model; by default it is loaded into CPU memory.
        self.model = GPTQModel.load(model_path, self.quantize_config)

    @classmethod
    def __load_dataset(cls, label_path):
        # You then need to prepare your data for calibration. Just put samples into a list,
        # each of which is a typical chat message as shown below. You can specify text and
        # image in the `content` field:
        # dataset = [
        #     # message 0
        #     [
        #         {"role": "system", "content": "You are a helpful assistant."},
        #         {"role": "user", "content": "Tell me who you are."},
        #         {"role": "assistant", "content": "I am a large language model named Qwen..."},
        #     ],
        #     # message 1
        #     [
        #         {
        #             "role": "user",
        #             "content": [
        #                 {"type": "image", "image": "file:///path/to/your/image.jpg"},
        #                 {"type": "text", "text": "Output all text in the image"},
        #             ],
        #         },
        #         {"role": "assistant", "content": "The text in the image is balabala..."},
        #     ],
        #     # other messages...
        #     ...,
        # ]
        # Here we use a caption dataset **only for demonstration**. You should replace it
        # with your own SFT dataset.
        with open(label_path, "r") as fpR:
            lines = fpR.readlines()
        dataset = []
        img_root = os.path.dirname(label_path)
        for line in lines:
            # Split only on the first comma so quoted captions keep their embedded commas.
            line = line.strip().split(",", 1)
            if len(line) < 2:
                continue
            img_path = os.path.join(img_root, line[0])
            if not os.path.exists(img_path):
                continue
            tmp = {}
            tmp["url"] = img_path
            tmp["caption"] = line[1].strip().strip('"')
            dataset.append(tmp)
        '''
        dataset:
        [
            {"url": IMG_PATH1, "caption": CAPTION1},
            {"url": IMG_PATH2, "caption": CAPTION2},
            {"url": IMG_PATH3, "caption": CAPTION3},
            ......
        ]
        '''
        return dataset

    @classmethod
    def __prepare_dataset(cls, label_path, n_sample=512) -> list[list[dict]]:
        dataset = QwenVLQuant.__load_dataset(label_path)
        sample_num = min(n_sample, len(dataset))
        dataset = dataset[:sample_num]
        return [
            [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": sample["url"]},
                        {"type": "text", "text": "generate a caption for this image"},
                    ],
                },
                {"role": "assistant", "content": sample["caption"]},
            ]
            for sample in dataset
        ]

    def quant_model(self, label_path):
        calibration_dataset = QwenVLQuant.__prepare_dataset(label_path)
        print("dataset num : ", len(calibration_dataset))
        # Quantize the model. For this multimodal model the calibration dataset is a list of
        # chat-message conversations, which Qwen2_5_VLGPTQ.prepare_dataset turns into processor inputs.
        self.model.quantize(calibration_dataset)

    def save_quanted_model(self, quant_path):
        # Finally, save the quantized model:
        self.model.save(quant_path)

        # Push the quantized model to the Hugging Face Hub.
        # To use use_auth_token=True, log in first via `huggingface-cli login`,
        # or pass an explicit token with use_auth_token="hf_xxxxxxx".
        # (uncomment the following three lines to enable this feature)
        # repo_id = f"YourUserName/{quantized_model_dir}"
        # commit_message = f"GPTQModel model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
        # model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True)

        # Alternatively, you can save and push at the same time.
        # (uncomment the following three lines to enable this feature)
        # repo_id = f"YourUserName/{quantized_model_dir}"
        # commit_message = f"GPTQModel model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
        # model.push_to_hub(repo_id, save_dir=quantized_model_dir, commit_message=commit_message, use_auth_token=True)

        # save quantized model using safetensors
        # model.save(quant_path)

        # # load quantized model to the first GPU
        # device = get_best_device()
        # model = GPTQModel.load(quant_path, device=device)

        # # load quantized model to CPU with IPEX kernel linear.
        # # model = GPTQModel.from_quantized(quantized_model_dir, device="cpu")

        # # download quantized model from Hugging Face Hub and load to the first GPU
        # # model = GPTQModel.from_quantized(repo_id, device="cuda:0",)

        # # inference with model.generate
        # print(tokenizer.decode(model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(model.device))[0]))


if __name__ == "__main__":
    if 4 != len(sys.argv):
        print("Usage:\n\t python %s model_path bits label_path" % (sys.argv[0]))
        sys.exit(-1)
    model_path = sys.argv[1]
    bits = int(sys.argv[2])
    label_path = sys.argv[3]
    quant_obj = QwenVLQuant(model_path, bits)
    quant_obj.quant_model(label_path)
    quant_path = model_path + "-GPTQModel-jnulzl-int%d" % (bits)
    quant_obj.save_quanted_model(quant_path)

Qwen2.5-VL GPTQ Quantization Tutorial

Environment

  • System & software: Ubuntu 20.04 + RTX 3090 + CUDA 11.8 + Python 3.10
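
A quick way to confirm that this environment is usable is a minimal PyTorch check. This snippet is only an illustrative sanity check under the versions listed above, not part of the gist:

# Minimal environment sanity check (illustrative, not part of the original gist).
import torch

print(torch.__version__)              # expected: 2.4.1+cu118
print(torch.version.cuda)             # expected: 11.8
print(torch.cuda.is_available())      # should be True with the RTX 3090 visible
print(torch.cuda.get_device_name(0))  # name of the detected GPU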

Dependencies

Install the following dependencies (a version-check snippet follows the list):

accelerate==1.4.0
aiohappyeyeballs==2.4.6
aiohttp==3.11.13
aiosignal==1.3.2
async-timeout==5.0.1
attrs==25.1.0
av==14.2.0
certifi==2025.1.31
charset-normalizer==3.4.1
datasets==3.3.2
decord==0.6.0
device-smi==0.4.0
dill==0.3.8
einops==0.8.1
filelock==3.17.0
flash-attn==2.7.3 # flash_attn-2.7.3+cu11torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
frozenlist==1.5.0
fsspec==2024.12.0
gptqmodel==1.9.0+cu118torch2.4 # gptqmodel-1.9.0+cu118torch2.4-cp310-cp310-linux_x86_64.whl
huggingface-hub==0.29.1
idna==3.10
Jinja2==3.1.5
MarkupSafe==3.0.2
mpmath==1.3.0
multidict==6.1.0
multiprocess==0.70.16
networkx==3.4.2
numpy==2.2.3
nvidia-cublas-cu11==11.11.3.6
nvidia-cuda-cupti-cu11==11.8.87
nvidia-cuda-nvrtc-cu11==11.8.89
nvidia-cuda-runtime-cu11==11.8.89
nvidia-cudnn-cu11==9.1.0.70
nvidia-cufft-cu11==10.9.0.58
nvidia-curand-cu11==10.3.0.86
nvidia-cusolver-cu11==11.4.1.48
nvidia-cusparse-cu11==11.7.5.86
nvidia-nccl-cu11==2.20.5
nvidia-nvtx-cu11==11.8.86
optimum==1.24.0
packaging==24.2
pandas==2.2.3
pillow==11.1.0
propcache==0.3.0
psutil==7.0.0
pyarrow==19.0.1
python-dateutil==2.9.0.post0
pytz==2025.1
PyYAML==6.0.2
qwen-vl-utils==0.0.8
regex==2024.11.6
requests==2.32.3
safetensors==0.5.3
six==1.17.0
sympy==1.13.3
threadpoolctl==3.5.0
tokenicer==0.0.4
tokenizers==0.21.0
torch==2.4.1+cu118 # torch-2.4.1+cu118-cp310-cp310-linux_x86_64.whl
torchaudio==2.4.1+cu118 # torchaudio-2.4.1+cu118-cp310-cp310-linux_x86_64.whl
torchvision==0.19.1+cu118 # torchvision-0.19.1+cu118-cp310-cp310-linux_x86_64.whl
tqdm==4.67.1
transformers==4.49.0
triton==3.0.0
typing_extensions==4.12.2
tzdata==2025.1
urllib3==2.3.0
xxhash==3.5.0
yarl==1.18.3
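
After installation, the pinned versions can be double-checked with a short dump. The flash-attn, gptqmodel, torch, torchaudio, and torchvision entries refer to prebuilt cu118 wheels (the .whl file names are noted in the comments above), so they may need to be installed from local wheel files rather than from PyPI. The snippet below is only an illustrative helper, not part of the gist:

# Print the installed versions of the key packages (illustrative helper, not from the gist).
from importlib.metadata import version, PackageNotFoundError

for pkg in ("torch", "torchvision", "transformers", "optimum", "gptqmodel", "qwen-vl-utils", "flash-attn"):
    try:
        print(pkg, version(pkg))
    except PackageNotFoundError:
        print(pkg, "NOT INSTALLED")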

Patch GPTQModel (!!!)

diff --git a/gptqmodel/models/_const.py b/gptqmodel/models/_const.py
index 08341897..c851d3f0 100644
--- a/gptqmodel/models/_const.py
+++ b/gptqmodel/models/_const.py
@@ -160,6 +160,7 @@ SUPPORTED_MODELS = [
     "minicpm3",
     "qwen2_moe",
     "qwen2_vl",
+    "qwen2_5_vl",
     "dbrx_converted",
     "deepseek_v2",
     "deepseek_v3",
diff --git a/gptqmodel/models/auto.py b/gptqmodel/models/auto.py
index 1a5bd65c..6dd7f949 100644
--- a/gptqmodel/models/auto.py
+++ b/gptqmodel/models/auto.py
@@ -100,6 +100,7 @@ from .definitions.qwen import QwenGPTQ  # noqa: E402
 from .definitions.qwen2 import Qwen2GPTQ  # noqa: E402
 from .definitions.qwen2_moe import Qwen2MoeGPTQ  # noqa: E402
 from .definitions.qwen2_vl import Qwen2VLGPTQ  # noqa: E402
+from .definitions.qwen2_5_vl import Qwen2_5_VLGPTQ  # noqa: E402
 from .definitions.rw import RWGPTQ  # noqa: E402
 from .definitions.stablelmepoch import StableLMEpochGPTQ  # noqa: E402
 from .definitions.starcoder2 import Starcoder2GPTQ  # noqa: E402
@@ -153,6 +154,7 @@ MODEL_MAP = {
     "minicpm3": MiniCPM3GPTQ,
     "qwen2_moe": Qwen2MoeGPTQ,
     "qwen2_vl": Qwen2VLGPTQ,
+    "qwen2_5_vl": Qwen2_5_VLGPTQ,
     "dbrx": DbrxGPTQ,
     "dbrx_converted": DbrxConvertedGPTQ,
     "deepseek_v2": DeepSeekV2GPTQ,
diff --git a/gptqmodel/models/definitions/__init__.py b/gptqmodel/models/definitions/__init__.py
index 922e15b4..1ce3bc9d 100644
--- a/gptqmodel/models/definitions/__init__.py
+++ b/gptqmodel/models/definitions/__init__.py
@@ -56,6 +56,7 @@ from .qwen import QwenGPTQ
 from .qwen2 import Qwen2GPTQ
 from .qwen2_moe import Qwen2MoeGPTQ
 from .qwen2_vl import Qwen2VLGPTQ
+from .qwen2_5_vl import Qwen2_5_VLGPTQ
 from .rw import RWGPTQ
 from .stablelmepoch import StableLMEpochGPTQ
 from .starcoder2 import Starcoder2GPTQ

The following change applies only to the GPTQModel v1.9.0 source code:

diff --git a/tests/models/ovis/image_to_test_dataset.py b/tests/models/ovis/image_to_test_dataset.py
index 63ad09b0..e8daa43b
--- a/tests/models/ovis/image_to_test_dataset.py
+++ b/tests/models/ovis/image_to_test_dataset.py
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from gptqmodel.models import OvisGPTQ, Qwen2VLGPTQ
+from gptqmodel.models import OvisGPTQ, Qwen2VLGPTQ, Qwen2_5_VLGPTQ
 
 
 def format_ovis_dataset(image, assistant):
@@ -65,4 +65,7 @@ def get_calib_dataset(model):
     if isinstance(model, Qwen2VLGPTQ):
         return prepare_dataset(format_qwen2_vl_dataset, n_sample=1)
 
+    if isinstance(model, Qwen2_5_VLGPTQ):
+        return prepare_dataset(format_qwen2_vl_dataset, n_sample=1)
+
     raise NotImplementedError(f"Unsupported MODEL: {model.__class__}")

Copy qwen2_5_vl.py (its full contents are given at the end of this page) into the XXXXX/python3.10/site-packages/gptqmodel/models/definitions directory.
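
After patching and copying the file, the registration can be verified with a quick import check. This is only a sanity-check sketch based on the diff above, not part of the gist:

# Verify that the patched GPTQModel recognizes qwen2_5_vl (sanity check, not from the gist).
from gptqmodel.models import Qwen2_5_VLGPTQ
from gptqmodel.models._const import SUPPORTED_MODELS

assert "qwen2_5_vl" in SUPPORTED_MODELS
print(Qwen2_5_VLGPTQ)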

Quantization

  • Calibration data

coco2017
├── val2017
└── val.csv

The contents of val.csv look like this (a hypothetical sketch for producing such a file follows the sample):

...
val2017/000000037777.jpg,"a kitchen with wooden cabinets on the walls, a stove, multiple drawers, a refrigerator, a counter with fruits, and a well-organized layout for cooking and storage needs."
val2017/000000087038.jpg, "multiple people wearing sweatshirts, a person on a bicycle performing tricks, and another person mid-jump off a skateboarding ramp. The backdrop consists of buildings with graffiti artworks, adding a vibrant feel to the urban setting. The image appears to be set in an urban skate park or a designated area for extreme sports within a city." 
...
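
The gist does not say how val.csv was produced, and the sample captions are far more detailed than the original COCO ones, so they were presumably generated separately. Purely to illustrate the expected two-column format (relative image path, quoted caption), here is a hypothetical sketch that builds such a file from the standard captions_val2017.json annotations; the annotation path and the one-caption-per-image choice are assumptions:

# Hypothetical helper: build a val.csv (image_path,caption) file from COCO caption annotations.
# The annotation path and taking one caption per image are assumptions, not from the gist.
import csv
import json

with open("coco2017/annotations/captions_val2017.json", "r") as f:
    coco = json.load(f)

id_to_name = {img["id"]: img["file_name"] for img in coco["images"]}
first_caption = {}
for ann in coco["annotations"]:
    first_caption.setdefault(ann["image_id"], ann["caption"].strip())

with open("coco2017/val.csv", "w", newline="") as f:
    writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)  # quotes fields containing commas (the captions)
    for image_id, caption in first_caption.items():
        writer.writerow(["val2017/" + id_to_name[image_id], caption])
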
  • Run quantization
python quant_qwenvl_gptqmodel.py Qwen/Qwen2.5-VL-3B-Instruct 8 dataset/coco2017/val.csv
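
With bits set to 8 as in the command above, the quantized model is written next to the input model as Qwen/Qwen2.5-VL-3B-Instruct-GPTQModel-jnulzl-int8 (quant_qwenvl_gptqmodel.py builds the output directory name by appending -GPTQModel-jnulzl-int<bits> to model_path).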

Demo with the quantized model

python demo_qwen_vl.py Qwen/Qwen2.5-VL-3B-Instruct-GPTQModel-jnulzl-int4
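
demo_qwen_vl.py selects Qwen2_5_VLForConditionalGeneration because the directory name contains "Qwen2.5-VL", so the same demo script works for both the original and the quantized checkpoint.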

qwen2_5_vl.py:

# Copyright 2024-2025 ModelCloud.ai
# Copyright 2024-2025 [email protected]
# Contact: [email protected], x.com/qubitium
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os.path
import shutil
from typing import Dict, Optional

from PIL import Image
from transformers import AutoModelForVision2Seq, AutoProcessor, AutoTokenizer

from ...utils.calibration import batched
from ...utils.image import extract_vision_info, fetch_image
from ...utils.model import MODALITY, move_to
from .._const import CPU
from ..base import BaseGPTQModel


class Qwen2_5_VLGPTQ(BaseGPTQModel):
    loader = AutoModelForVision2Seq

    base_modules = ["model.embed_tokens", "model.norm"]
    pre_lm_head_norm_module = "model.norm"

    layers_node = "model.layers"
    layer_type = "Qwen2_5_VLDecoderLayer"
    layer_modules = [
        ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
        ["self_attn.o_proj"],
        ["mlp.up_proj", "mlp.gate_proj"],
        ["mlp.down_proj"],
    ]

    modality = [MODALITY.TEXT, MODALITY.IMAGE_TO_TEXT]

    require_load_processor = True

    quant_override_files = {
        "preprocessor_config.json": {
            "do_convert_rgb": True,
            "do_normalize": True,
            "do_rescale": True,
            "do_resize": True,
            "image_mean": [
                0.48145466,
                0.4578275,
                0.40821073
            ],
            "image_processor_type": "Qwen2VLImageProcessor",
            "image_std": [
                0.26862954,
                0.26130258,
                0.27577711
            ],
            "max_pixels": 12845056,
            "merge_size": 2,
            "min_pixels": 3136,
            "patch_size": 14,
            "processor_class": "Qwen2_5_VLProcessor",
            "resample": 3,
            "rescale_factor": 0.00392156862745098,
            "size": {
                "max_pixels": 1003520,
                "min_pixels": 3136
            },
            "temporal_patch_size": 2,
            "vision_token_id": 151654
        }
    }

    def pre_quantize_generate_hook_start(self):
        self.model.visual = move_to(self.model.visual, device=self.quantize_config.device)

    def pre_quantize_generate_hook_end(self):
        self.model.visual = move_to(self.model.visual, device=CPU)

    @staticmethod
    def process_vision_info(
        conversations: list[dict] | list[list[dict]],
    ) -> Optional[list[Image.Image]]:
        vision_infos = extract_vision_info(conversations)
        # Read images
        image_inputs = []
        for vision_info in vision_infos:
            if "image" in vision_info or "image_url" in vision_info:
                image_inputs.append(fetch_image(vision_info))
            else:
                raise ValueError("image, image_url should in content.")
        if len(image_inputs) == 0:
            image_inputs = None
        return image_inputs

    def preprocess_dataset(self, sample: Dict) -> Dict:
        return sample

    def prepare_dataset(
        self,
        calibration_dataset,
        calibration_dataset_concat_size,
        batch_size: int = 1,
        tokenizer=None,
    ):
        import json
        import tempfile

        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(self.model_local_path)
        with tempfile.TemporaryDirectory() as tmp_dir:
            chat_template_file = os.path.join(self.model_local_path, "chat_template.json")
            if os.path.exists(chat_template_file):
                shutil.copyfile(chat_template_file, os.path.join(tmp_dir, "chat_template.json"))
            tokenizer.save_pretrained(tmp_dir)
            with open(os.path.join(tmp_dir, "preprocessor_config.json"), "w") as f:
                f.write(json.dumps(self.quant_override_files["preprocessor_config.json"]))
            processor = AutoProcessor.from_pretrained(tmp_dir)
            calib_data = []
            for batch in batched(calibration_dataset, batch_size, process_func=self.preprocess_dataset):
                text = processor.apply_chat_template(
                    batch, tokenize=False, add_generation_prompt=True
                )
                image_inputs = self.process_vision_info(batch)
                inputs = processor(
                    text=text,
                    images=image_inputs,
                    videos=None,
                    padding=True,
                    return_tensors="pt",
                )
                calib_data.append(inputs)
            del processor
        return calib_data
jnulzl commented on Feb 27, 2025:

qwen2_5_vl.py was derived from GPTQModel's qwen2_vl.py with only slight modifications.
