LLM Compressor Devstral2

#!/usr/bin/env python3
"""
Quantize a (multi-modal) Mistral3ForConditionalGeneration model with LLM Compressor
to match a config.json that looks like:
  - quant_method: compressed-tensors
  - format: pack-quantized
  - Linear weights: int4, symmetric, group strategy, group_size=128, observer=mse, dynamic=false
  - no activation quantization
  - ignore: explicit module names (vision tower, projector, lm_head)

Usage:
    python3 devstral2_quantize.py \
        --model_id mistralai/Devstral-Small-2-24B-Instruct-2512 \
        --save_dir ./androiddrew/Devstral-Small-2-24B-Instruct-2512-W4A16-G128-MSE \
        --dataset HuggingFaceH4/ultrachat_200k \
        --split "train_sft[:512]" \
        --max_seq_len 2048 \
        --num_calibration_samples 512
"""
from __future__ import annotations

import argparse

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier

# ---- Exact ignore list as in your config.json ----
IGNORE_MODULES = [
    "model.vision_tower.transformer.layers.0.feed_forward.gate_proj",
    "model.vision_tower.transformer.layers.0.feed_forward.up_proj",
    "model.vision_tower.transformer.layers.0.feed_forward.down_proj",
    "model.vision_tower.transformer.layers.0.attention.k_proj",
    "model.vision_tower.transformer.layers.0.attention.v_proj",
    "model.vision_tower.transformer.layers.0.attention.q_proj",
    "model.vision_tower.transformer.layers.0.attention.o_proj",
    "model.vision_tower.transformer.layers.1.feed_forward.gate_proj",
    "model.vision_tower.transformer.layers.1.feed_forward.up_proj",
    "model.vision_tower.transformer.layers.1.feed_forward.down_proj",
    "model.vision_tower.transformer.layers.1.attention.k_proj",
    "model.vision_tower.transformer.layers.1.attention.v_proj",
    "model.vision_tower.transformer.layers.1.attention.q_proj",
    "model.vision_tower.transformer.layers.1.attention.o_proj",
    "model.vision_tower.transformer.layers.2.feed_forward.gate_proj",
    "model.vision_tower.transformer.layers.2.feed_forward.up_proj",
    "model.vision_tower.transformer.layers.2.feed_forward.down_proj",
    "model.vision_tower.transformer.layers.2.attention.k_proj",
    "model.vision_tower.transformer.layers.2.attention.v_proj",
    "model.vision_tower.transformer.layers.2.attention.q_proj",
    "model.vision_tower.transformer.layers.2.attention.o_proj",
    "model.vision_tower.transformer.layers.3.feed_forward.gate_proj",
    "model.vision_tower.transformer.layers.3.feed_forward.up_proj",
    "model.vision_tower.transformer.layers.3.feed_forward.down_proj",
    "model.vision_tower.transformer.layers.3.attention.k_proj",
    "model.vision_tower.transformer.layers.3.attention.v_proj",
    "model.vision_tower.transformer.layers.3.attention.q_proj",
    "model.vision_tower.transformer.layers.3.attention.o_proj",
    "model.vision_tower.transformer.layers.4.feed_forward.gate_proj",
    "model.vision_tower.transformer.layers.4.feed_forward.up_proj",
    "model.vision_tower.transformer.layers.4.feed_forward.down_proj",
    "model.vision_tower.transformer.layers.4.attention.k_proj",
    "model.vision_tower.transformer.layers.4.attention.v_proj",
    "model.vision_tower.transformer.layers.4.attention.q_proj",
    "model.vision_tower.transformer.layers.4.attention.o_proj",
    "model.vision_tower.transformer.layers.5.feed_forward.gate_proj",
    "model.vision_tower.transformer.layers.5.feed_forward.up_proj",
    "model.vision_tower.transformer.layers.5.feed_forward.down_proj",
    "model.vision_tower.transformer.layers.5.attention.k_proj",
    "model.vision_tower.transformer.layers.5.attention.v_proj",
    "model.vision_tower.transformer.layers.5.attention.q_proj",
    "model.vision_tower.transformer.layers.5.attention.o_proj",
    "model.vision_tower.transformer.layers.6.feed_forward.gate_proj",
    "model.vision_tower.transformer.layers.6.feed_forward.up_proj",
    "model.vision_tower.transformer.layers.6.feed_forward.down_proj",
    "model.vision_tower.transformer.layers.6.attention.k_proj",
    "model.vision_tower.transformer.layers.6.attention.v_proj",
    "model.vision_tower.transformer.layers.6.attention.q_proj",
    "model.vision_tower.transformer.layers.6.attention.o_proj",
    "model.vision_tower.transformer.layers.7.feed_forward.gate_proj",
    "model.vision_tower.transformer.layers.7.feed_forward.up_proj",
    "model.vision_tower.transformer.layers.7.feed_forward.down_proj",
    "model.vision_tower.transformer.layers.7.attention.k_proj",
    "model.vision_tower.transformer.layers.7.attention.v_proj",
    "model.vision_tower.transformer.layers.7.attention.q_proj",
    "model.vision_tower.transformer.layers.7.attention.o_proj",
    "model.vision_tower.transformer.layers.8.feed_forward.gate_proj",
    "model.vision_tower.transformer.layers.8.feed_forward.up_proj",
    "model.vision_tower.transformer.layers.8.feed_forward.down_proj",
    "model.vision_tower.transformer.layers.8.attention.k_proj",
    "model.vision_tower.transformer.layers.8.attention.v_proj",
    "model.vision_tower.transformer.layers.8.attention.q_proj",
    "model.vision_tower.transformer.layers.8.attention.o_proj",
    "model.vision_tower.transformer.layers.9.feed_forward.gate_proj",
    "model.vision_tower.transformer.layers.9.feed_forward.up_proj",
    "model.vision_tower.transformer.layers.9.feed_forward.down_proj",
    "model.vision_tower.transformer.layers.9.attention.k_proj",
    "model.vision_tower.transformer.layers.9.attention.v_proj",
    "model.vision_tower.transformer.layers.9.attention.q_proj",
    "model.vision_tower.transformer.layers.9.attention.o_proj",
    "model.vision_tower.transformer.layers.10.feed_forward.gate_proj",
    "model.vision_tower.transformer.layers.10.feed_forward.up_proj",
    "model.vision_tower.transformer.layers.10.feed_forward.down_proj",
    "model.vision_tower.transformer.layers.10.attention.k_proj",
    "model.vision_tower.transformer.layers.10.attention.v_proj",
    "model.vision_tower.transformer.layers.10.attention.q_proj",
    "model.vision_tower.transformer.layers.10.attention.o_proj",
    "model.vision_tower.transformer.layers.11.feed_forward.gate_proj",
    "model.vision_tower.transformer.layers.11.feed_forward.up_proj",
    "model.vision_tower.transformer.layers.11.feed_forward.down_proj",
    "model.vision_tower.transformer.layers.11.attention.k_proj",
    "model.vision_tower.transformer.layers.11.attention.v_proj",
    "model.vision_tower.transformer.layers.11.attention.q_proj",
    "model.vision_tower.transformer.layers.11.attention.o_proj",
    "model.vision_tower.transformer.layers.12.feed_forward.gate_proj",
    "model.vision_tower.transformer.layers.12.feed_forward.up_proj",
    "model.vision_tower.transformer.layers.12.feed_forward.down_proj",
    "model.vision_tower.transformer.layers.12.attention.k_proj",
    "model.vision_tower.transformer.layers.12.attention.v_proj",
    "model.vision_tower.transformer.layers.12.attention.q_proj",
    "model.vision_tower.transformer.layers.12.attention.o_proj",
    "model.vision_tower.transformer.layers.13.feed_forward.gate_proj",
    "model.vision_tower.transformer.layers.13.feed_forward.up_proj",
    "model.vision_tower.transformer.layers.13.feed_forward.down_proj",
    "model.vision_tower.transformer.layers.13.attention.k_proj",
    "model.vision_tower.transformer.layers.13.attention.v_proj",
    "model.vision_tower.transformer.layers.13.attention.q_proj",
    "model.vision_tower.transformer.layers.13.attention.o_proj",
    "model.vision_tower.transformer.layers.14.feed_forward.gate_proj",
    "model.vision_tower.transformer.layers.14.feed_forward.up_proj",
    "model.vision_tower.transformer.layers.14.feed_forward.down_proj",
    "model.vision_tower.transformer.layers.14.attention.k_proj",
    "model.vision_tower.transformer.layers.14.attention.v_proj",
    "model.vision_tower.transformer.layers.14.attention.q_proj",
    "model.vision_tower.transformer.layers.14.attention.o_proj",
    "model.vision_tower.transformer.layers.15.feed_forward.gate_proj",
    "model.vision_tower.transformer.layers.15.feed_forward.up_proj",
    "model.vision_tower.transformer.layers.15.feed_forward.down_proj",
    "model.vision_tower.transformer.layers.15.attention.k_proj",
    "model.vision_tower.transformer.layers.15.attention.v_proj",
    "model.vision_tower.transformer.layers.15.attention.q_proj",
    "model.vision_tower.transformer.layers.15.attention.o_proj",
    "model.vision_tower.transformer.layers.16.feed_forward.gate_proj",
    "model.vision_tower.transformer.layers.16.feed_forward.up_proj",
    "model.vision_tower.transformer.layers.16.feed_forward.down_proj",
    "model.vision_tower.transformer.layers.16.attention.k_proj",
    "model.vision_tower.transformer.layers.16.attention.v_proj",
    "model.vision_tower.transformer.layers.16.attention.q_proj",
    "model.vision_tower.transformer.layers.16.attention.o_proj",
    "model.vision_tower.transformer.layers.17.feed_forward.gate_proj",
    "model.vision_tower.transformer.layers.17.feed_forward.up_proj",
    "model.vision_tower.transformer.layers.17.feed_forward.down_proj",
    "model.vision_tower.transformer.layers.17.attention.k_proj",
    "model.vision_tower.transformer.layers.17.attention.v_proj",
    "model.vision_tower.transformer.layers.17.attention.q_proj",
    "model.vision_tower.transformer.layers.17.attention.o_proj",
    "model.vision_tower.transformer.layers.18.feed_forward.gate_proj",
    "model.vision_tower.transformer.layers.18.feed_forward.up_proj",
    "model.vision_tower.transformer.layers.18.feed_forward.down_proj",
    "model.vision_tower.transformer.layers.18.attention.k_proj",
    "model.vision_tower.transformer.layers.18.attention.v_proj",
    "model.vision_tower.transformer.layers.18.attention.q_proj",
    "model.vision_tower.transformer.layers.18.attention.o_proj",
    "model.vision_tower.transformer.layers.19.feed_forward.gate_proj",
    "model.vision_tower.transformer.layers.19.feed_forward.up_proj",
    "model.vision_tower.transformer.layers.19.feed_forward.down_proj",
    "model.vision_tower.transformer.layers.19.attention.k_proj",
    "model.vision_tower.transformer.layers.19.attention.v_proj",
    "model.vision_tower.transformer.layers.19.attention.q_proj",
    "model.vision_tower.transformer.layers.19.attention.o_proj",
    "model.vision_tower.transformer.layers.20.feed_forward.gate_proj",
    "model.vision_tower.transformer.layers.20.feed_forward.up_proj",
    "model.vision_tower.transformer.layers.20.feed_forward.down_proj",
    "model.vision_tower.transformer.layers.20.attention.k_proj",
    "model.vision_tower.transformer.layers.20.attention.v_proj",
    "model.vision_tower.transformer.layers.20.attention.q_proj",
    "model.vision_tower.transformer.layers.20.attention.o_proj",
    "model.vision_tower.transformer.layers.21.feed_forward.gate_proj",
    "model.vision_tower.transformer.layers.21.feed_forward.up_proj",
    "model.vision_tower.transformer.layers.21.feed_forward.down_proj",
    "model.vision_tower.transformer.layers.21.attention.k_proj",
    "model.vision_tower.transformer.layers.21.attention.v_proj",
    "model.vision_tower.transformer.layers.21.attention.q_proj",
    "model.vision_tower.transformer.layers.21.attention.o_proj",
    "model.vision_tower.transformer.layers.22.feed_forward.gate_proj",
    "model.vision_tower.transformer.layers.22.feed_forward.up_proj",
    "model.vision_tower.transformer.layers.22.feed_forward.down_proj",
    "model.vision_tower.transformer.layers.22.attention.k_proj",
    "model.vision_tower.transformer.layers.22.attention.v_proj",
    "model.vision_tower.transformer.layers.22.attention.q_proj",
    "model.vision_tower.transformer.layers.22.attention.o_proj",
    "model.vision_tower.transformer.layers.23.feed_forward.gate_proj",
    "model.vision_tower.transformer.layers.23.feed_forward.up_proj",
    "model.vision_tower.transformer.layers.23.feed_forward.down_proj",
    "model.vision_tower.transformer.layers.23.attention.k_proj",
    "model.vision_tower.transformer.layers.23.attention.v_proj",
    "model.vision_tower.transformer.layers.23.attention.q_proj",
    "model.vision_tower.transformer.layers.23.attention.o_proj",
    "model.multi_modal_projector.patch_merger.merging_layer",
    "model.multi_modal_projector.linear_1",
    "model.multi_modal_projector.linear_2",
    "lm_head",
]
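
# Optional sanity check (a minimal sketch, not part of the original recipe). The
# hand-written list above follows a strict pattern: 24 vision tower layers with 7
# linear projections each, plus the multi-modal projector and lm_head. Regenerating
# it programmatically and asserting equality guards against a typo or a dropped
# layer. The layer count of 24 is an assumption read off the explicit list itself.
_EXPECTED_IGNORE = [
    f"model.vision_tower.transformer.layers.{i}.{block}.{proj}"
    for i in range(24)
    for block, projs in (
        ("feed_forward", ("gate_proj", "up_proj", "down_proj")),
        ("attention", ("k_proj", "v_proj", "q_proj", "o_proj")),
    )
    for proj in projs
] + [
    "model.multi_modal_projector.patch_merger.merging_layer",
    "model.multi_modal_projector.linear_1",
    "model.multi_modal_projector.linear_2",
    "lm_head",
]
assert IGNORE_MODULES == _EXPECTED_IGNORE, "IGNORE_MODULES drifted from the expected pattern"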

def build_parser() -> argparse.ArgumentParser:
    p = argparse.ArgumentParser()
    p.add_argument("--model_id", required=True, help="HF repo id or local path")
    p.add_argument("--save_dir", required=True, help="Output directory")
    p.add_argument("--dataset", default="HuggingFaceH4/ultrachat_200k")
    p.add_argument("--split", default="train_sft[:512]")
    p.add_argument("--max_seq_len", type=int, default=2048)
    p.add_argument("--num_calibration_samples", type=int, default=512)
    p.add_argument("--seed", type=int, default=42)
    return p


def main() -> None:
    args = build_parser().parse_args()

    # Load model/tokenizer.
    # Docs use AutoModelForCausalLM for quantized save/load paths; works for many
    # decoder-only + VLM wrappers. If your checkpoint requires a custom class, swap
    # to the specific class import.
    model = AutoModelForCausalLM.from_pretrained(args.model_id, dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(args.model_id)

    # Calibration dataset (text-only). For multimodal models, LLM Compressor examples
    # often avoid quantizing the vision tower; text calibration is typically sufficient
    # for the LM blocks you *are* quantizing.
    ds = load_dataset(args.dataset, split=args.split).shuffle(seed=args.seed)

    def preprocess(ex):
        # Ultrachat uses "messages" -> chat template
        return {"text": tokenizer.apply_chat_template(ex["messages"], tokenize=False)}

    ds = ds.map(preprocess)

    def tokenize(ex):
        # The chat template already inserts BOS etc.; follow the official W4A16 example guidance.
        return tokenizer(
            ex["text"],
            padding=False,
            truncation=True,
            max_length=args.max_seq_len,
            add_special_tokens=False,
        )

    ds = ds.map(tokenize, remove_columns=ds.column_names)
    # Custom scheme matching your config.json:
    #   - weights only
    #   - int4
    #   - group strategy
    #   - group_size=128
    #   - symmetric=true
    #   - observer=mse
    #   - packed format
    scheme = {
        "format": "pack-quantized",
        "weights": {
            "type": "int",
            "num_bits": 4,
            "strategy": "group",
            "group_size": 128,
            "symmetric": True,
            "dynamic": False,
            "observer": "mse",
            "observer_kwargs": {},
            "actorder": None,
            "block_structure": None,
        },
        "input_activations": None,
        "output_activations": None,
    }

    recipe = [
        GPTQModifier(
            targets="Linear",
            scheme=scheme,
            # For Mistral-family blocks, docs show using MistralDecoderLayer to control
            # activation offloading granularity.
            sequential_targets=["MistralDecoderLayer"],
            # Match your explicit ignore list so the emitted config.json "ignore" is identical in spirit.
            ignore=IGNORE_MODULES,
        )
    ]
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=args.max_seq_len,
        num_calibration_samples=args.num_calibration_samples,
    )

    # Save in "compressed-tensors" compatible form (this is what drives quant_method=compressed-tensors).
    model.save_pretrained(args.save_dir, save_compressed=True)
    tokenizer.save_pretrained(args.save_dir)
    print(f"Saved compressed model to: {args.save_dir}")
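

def verify_quant_config(save_dir: str) -> None:
    """Optional post-save check: a minimal sketch, not part of the original recipe.

    Prints the "quantization_config" block of the emitted config.json so you can
    eyeball that quant_method, format, group_size and the ignore list match the
    target described in the module docstring. The key name follows what
    compressed-tensors checkpoints typically write; adjust if your version differs.
    Call it manually after the run, e.g. verify_quant_config("./my-save-dir").
    """
    import json
    import os

    with open(os.path.join(save_dir, "config.json")) as f:
        cfg = json.load(f)
    print(json.dumps(cfg.get("quantization_config", {}), indent=2))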


if __name__ == "__main__":
    main()
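
# Downstream note (an assumption about your serving setup, not something this script
# verifies): directories saved with save_compressed=True contain compressed-tensors
# checkpoints, which engines such as vLLM can typically load directly, e.g.
#   vllm serve ./androiddrew/Devstral-Small-2-24B-Instruct-2512-W4A16-G128-MSE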