LLM Compressor Devstral2
#!/usr/bin/env python3
"""
Quantize a (multi-modal) Mistral3ForConditionalGeneration model with LLM Compressor
to match a config.json that looks like:
- quant_method: compressed-tensors
- format: pack-quantized
- Linear weights: int4, symmetric, group strategy, group_size=128, observer=mse, dynamic=false
- no activation quantization
- ignore: explicit module names (vision tower, projector, lm_head)
Usage:
    python3 devstral2_quantize.py \
        --model_id mistralai/Devstral-Small-2-24B-Instruct-2512 \
        --save_dir ./androiddrew/Devstral-Small-2-24B-Instruct-2512-W4A16-G128-MSE \
        --dataset HuggingFaceH4/ultrachat_200k \
        --split "train_sft[:512]" \
        --max_seq_len 2048 \
        --num_calibration_samples 512
"""
from __future__ import annotations
import argparse
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
# ---- Exact ignore list, as in the target config.json ----
# Generated programmatically; the resulting entries and their order match the explicit
# per-layer names in the original config (vision tower, multi-modal projector, lm_head).
IGNORE_MODULES = [
    f"model.vision_tower.transformer.layers.{layer}.{block}.{proj}"
    for layer in range(24)
    for block, projs in (
        ("feed_forward", ("gate_proj", "up_proj", "down_proj")),
        ("attention", ("k_proj", "v_proj", "q_proj", "o_proj")),
    )
    for proj in projs
] + [
    "model.multi_modal_projector.patch_merger.merging_layer",
    "model.multi_modal_projector.linear_1",
    "model.multi_modal_projector.linear_2",
    "lm_head",
]
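# Sanity check: 24 vision-tower layers x 7 Linear projections each, plus the three
# multi-modal projector modules and lm_head.
assert len(IGNORE_MODULES) == 24 * 7 + 4, "unexpected ignore-list length"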
def build_parser() -> argparse.ArgumentParser:
    p = argparse.ArgumentParser()
    p.add_argument("--model_id", required=True, help="HF repo id or local path")
    p.add_argument("--save_dir", required=True, help="Output directory")
    p.add_argument("--dataset", default="HuggingFaceH4/ultrachat_200k")
    p.add_argument("--split", default="train_sft[:512]")
    p.add_argument("--max_seq_len", type=int, default=2048)
    p.add_argument("--num_calibration_samples", type=int, default=512)
    p.add_argument("--seed", type=int, default=42)
    return p
def main() -> None:
    args = build_parser().parse_args()

    # Load model/tokenizer.
    # The docs use AutoModelForCausalLM for the quantized save/load path; it works for many
    # decoder-only and VLM wrapper checkpoints. If your checkpoint requires a specific class
    # (e.g. Mistral3ForConditionalGeneration), swap in that import.
    model = AutoModelForCausalLM.from_pretrained(args.model_id, dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(args.model_id)

    # Calibration dataset (text-only). The LLM Compressor multimodal examples typically leave the
    # vision tower unquantized, so text-only calibration is usually sufficient for the LM blocks
    # that *are* being quantized.
    ds = load_dataset(args.dataset, split=args.split).shuffle(seed=args.seed)

    def preprocess(ex):
        # Ultrachat stores conversations under "messages" -> render them with the chat template.
        return {"text": tokenizer.apply_chat_template(ex["messages"], tokenize=False)}

    ds = ds.map(preprocess)

    def tokenize(ex):
        # The chat template already inserts BOS etc., so skip special tokens here,
        # following the official W4A16 example guidance.
        return tokenizer(
            ex["text"],
            padding=False,
            truncation=True,
            max_length=args.max_seq_len,
            add_special_tokens=False,
        )

    ds = ds.map(tokenize, remove_columns=ds.column_names)
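    # `ds` now holds tokenized samples (input_ids / attention_mask); oneshot() below draws
    # up to --num_calibration_samples of them for calibration.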
    # Custom scheme matching the target config.json:
    # - weights only (no activation quantization)
    # - int4, symmetric
    # - group strategy with group_size=128
    # - observer=mse
    # - pack-quantized storage format
    scheme = {
        "format": "pack-quantized",
        "weights": {
            "type": "int",
            "num_bits": 4,
            "strategy": "group",
            "group_size": 128,
            "symmetric": True,
            "dynamic": False,
            "observer": "mse",
            "observer_kwargs": {},
            "actorder": None,
            "block_structure": None,
        },
        "input_activations": None,
        "output_activations": None,
    }
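    # Note (assumption from the compressed-tensors preset definitions): apart from
    # observer="mse", this dict should be equivalent to the built-in "W4A16" preset
    # (int4 weights, group_size=128, symmetric, no activation quantization). The explicit
    # dict is kept so the emitted quantization_config mirrors the target field-for-field.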
    recipe = [
        GPTQModifier(
            targets="Linear",
            scheme=scheme,
            # For Mistral-family blocks, the docs use MistralDecoderLayer to control
            # activation offloading granularity.
            sequential_targets=["MistralDecoderLayer"],
            # Match the explicit ignore list so the emitted config.json "ignore" is identical in spirit.
            ignore=IGNORE_MODULES,
        )
    ]
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=args.max_seq_len,
        num_calibration_samples=args.num_calibration_samples,
    )
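    # Optional smoke test (assumption: mirrors the quick sanity generation used in the official
    # LLM Compressor examples; safe to delete for a pure quantize-and-save run).
    sample = tokenizer("def fibonacci(n):", return_tensors="pt").to(model.device)
    print(tokenizer.decode(model.generate(**sample, max_new_tokens=64)[0]))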
    # Save in compressed-tensors compatible form (this is what drives quant_method="compressed-tensors").
    model.save_pretrained(args.save_dir, save_compressed=True)
    tokenizer.save_pretrained(args.save_dir)
    print(f"Saved compressed model to: {args.save_dir}")
if __name__ == "__main__":
    main()
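# The saved directory is a standard compressed-tensors checkpoint, so it can typically be
# loaded by any stack with compressed-tensors support (assumption based on the usual
# LLM Compressor workflow, not part of the original gist), e.g. with vLLM:
#
#   vllm serve ./androiddrew/Devstral-Small-2-24B-Instruct-2512-W4A16-G128-MSE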