Created
April 22, 2026 18:54
-
-
Save HDCharles/8bbfecb47f72db8d0279dc741f19defa to your computer and use it in GitHub Desktop.
GPTQ actorder regression test suite for llm-compressor
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """Extract summary data from AWQ DDP regression test log files. | |
| Parses the log output from run_all_tests.sh and produces a comparison table | |
| showing pre-DDP vs post-DDP results across models, schemes, and benchmarks. | |
| Usage: | |
| python extract_log_summary.py regression_results.log | |
| """ | |
| import re | |
| import sys | |
| from collections import defaultdict | |
def extract_log_summary(log_path):
    """Parse a regression-test log and print a pre/post-DDP comparison table.

    The log is expected to contain one box-drawn header per run::

        ║  MODEL: <model>
        ║  SCHEME: <scheme>
        ║  CODE STATE: <pre-ddp|post-ddp>

    followed by timing lines ("Time: ... minutes (... seconds)",
    "Peak GPU Memory: ... GB") and lm-eval markdown result tables.

    :param log_path: path to the log file produced by run_all_tests.sh
    :return: ``(results, timing)`` — nested dicts keyed by
        ``[model][scheme][code_state]``.  Returning the parsed data makes
        the function testable; existing callers that ignored the previous
        ``None`` return are unaffected.
    """
    with open(log_path, "r") as f:
        content = f.read()

    # Identify each (model, scheme, code-state) header section.
    section_pattern = re.compile(
        r"║\s+MODEL:\s+(.+?)\n"
        r"\s*║\s+SCHEME:\s+(.+?)\n"
        r"\s*║\s+CODE STATE:\s+(.+?)\n"
        r".*?"
        r"(?=║\s+MODEL:|╔══.*FINAL SUMMARY|\Z)",
        re.DOTALL,
    )
    sections = section_pattern.findall(content)

    # results[model][scheme][code_state] = {metric: value}
    results = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
    timing = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))

    for model, scheme, code_state in sections:
        model = model.strip()
        scheme = scheme.strip()
        code_state = code_state.strip()

        # Locate the body for this exact section.  The SCHEME component
        # must be part of the anchor: without it, two sections sharing the
        # same model and code state (but different schemes) both resolved
        # to the FIRST section's body, attributing its metrics to the
        # wrong scheme.
        pattern = (
            re.escape(f"MODEL: {model}")
            + r".*?"
            + re.escape(f"SCHEME: {scheme}")
            + r".*?"
            + re.escape(f"CODE STATE: {code_state}")
        )
        match = re.search(pattern, content, re.DOTALL)
        if not match:
            continue

        # Body runs from the end of this header to the next section header
        # (or to end-of-file for the last section).
        start = match.end()
        next_section = re.search(r"║\s+MODEL:", content[start:])
        end = start + next_section.start() if next_section else len(content)
        body = content[start:end]

        # Timing / memory lines emitted by the quantization scripts.
        time_match = re.search(
            r"Time:\s*([\d.]+)\s*minutes\s*\(([\d.]+)\s*seconds\)", body
        )
        if time_match:
            timing[model][scheme][code_state]["time_min"] = float(
                time_match.group(1)
            )
        gpu_match = re.search(r"Peak GPU Memory:\s*([\d.]+)\s*GB", body)
        if gpu_match:
            timing[model][scheme][code_state]["gpu_gb"] = float(gpu_match.group(1))

        # Benchmark metrics from the lm-eval markdown tables: the value
        # is the 4th pipe-delimited field after the metric name.
        metric_rows = (
            ("gsm8k_strict", r"strict-match\s*\|.*?\|.*?\|.*?\|([\d.]+)\|"),
            ("wikitext_ppl", r"word_perplexity\s*\|.*?\|.*?\|.*?\|([\d.]+)\|"),
            ("mmlu_acc", r"acc\s*\|.*?\|.*?\|.*?\|([\d.]+)\|"),
        )
        for metric_key, row_pattern in metric_rows:
            metric_match = re.search(row_pattern, body)
            if metric_match:
                results[model][scheme][code_state][metric_key] = float(
                    metric_match.group(1)
                )

    _print_comparison_table(log_path, results, timing)
    return results, timing


def _print_comparison_table(log_path, results, timing):
    """Print the per-model pre/post-DDP comparison table with delta rows.

    A trailing ``*`` on a delta flags a likely regression: any perplexity
    increase, or an accuracy drop larger than 0.01.
    """
    print(f"\nLog: {log_path}\n")
    metrics = ["gsm8k_strict", "wikitext_ppl", "mmlu_acc"]
    metric_labels = ["GSM8K Strict", "Wiki PPL", "MMLU Acc"]
    header = (
        f"{'Model':<35} {'Scheme':<12} {'State':<10} "
        f"{'Time':>7} {'GPU':>6} "
        + " ".join(f"{m:>12}" for m in metric_labels)
    )
    print(header)
    print("-" * len(header))
    for model in sorted(results.keys()):
        for scheme in sorted(results[model].keys()):
            for code_state in ["pre-ddp", "post-ddp"]:
                r = results[model][scheme].get(code_state, {})
                t = timing[model][scheme].get(code_state, {})
                time_str = f"{t['time_min']:.1f}m" if "time_min" in t else "N/A"
                gpu_str = f"{t['gpu_gb']:.1f}G" if "gpu_gb" in t else "N/A"
                vals = [f"{r[m]:.4f}" if m in r else "N/A" for m in metrics]
                print(
                    f"{model:<35} {scheme:<12} {code_state:<10} "
                    f"{time_str:>7} {gpu_str:>6} "
                    + " ".join(f"{v:>12}" for v in vals)
                )
            # Delta row: post-ddp minus pre-ddp for each metric.
            pre = results[model][scheme].get("pre-ddp", {})
            post = results[model][scheme].get("post-ddp", {})
            deltas = []
            for m in metrics:
                if m in pre and m in post:
                    diff = post[m] - pre[m]
                    sign = "+" if diff >= 0 else ""
                    # For perplexity, lower is better, so flip the indicator.
                    if m == "wikitext_ppl":
                        indicator = " *" if diff > 0 else ""
                    else:
                        indicator = " *" if diff < -0.01 else ""
                    deltas.append(f"{sign}{diff:.4f}{indicator}")
                else:
                    deltas.append("---")
            print(
                f"{'':35} {'':12} {'delta':<10} "
                f"{'':>7} {'':>6} "
                + " ".join(f"{d:>12}" for d in deltas)
            )
            print()
if __name__ == "__main__":
    # Default to the standard results file when no path is supplied.
    cli_args = sys.argv[1:]
    extract_log_summary(cli_args[0] if cli_args else "regression_results.log")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import argparse | |
| import time | |
| import torch | |
| from compressed_tensors.offload import dispatch_model | |
| from datasets import load_dataset | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| from llmcompressor import oneshot | |
| from llmcompressor.modifiers.awq import AWQModifier | |
# Calibration configuration for the Llama-3-8B AWQ run.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512


def main():
    """AWQ-quantize Llama-3-8B-Instruct, smoke-test generation, and save."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--scheme", default="W4A16_ASYM")
    parser.add_argument("--save-dir", default=None)
    parser.add_argument("--num-samples", type=int, default=NUM_CALIBRATION_SAMPLES)
    args = parser.parse_args()
    sample_count = args.num_samples

    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    # Build the calibration set: shuffle, render chat templates, tokenize.
    calib = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{sample_count}]")
    calib = calib.shuffle(seed=42)

    def render_chat(example):
        rendered = tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
        return {"text": rendered}

    calib = calib.map(render_chat)

    def encode(sample):
        return tokenizer(
            sample["text"],
            padding=False,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
            add_special_tokens=False,
        )

    calib = calib.map(encode, remove_columns=calib.column_names)

    recipe = [
        AWQModifier(
            ignore=["lm_head"],
            scheme=args.scheme,
            targets=["Linear"],
            duo_scaling="both",
        ),
    ]

    # Run one-shot quantization while tracking wall time and peak VRAM.
    torch.cuda.reset_peak_memory_stats()
    t0 = time.time()
    oneshot(
        model=model,
        dataset=calib,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=sample_count,
    )
    elapsed = time.time() - t0
    peak_gb = torch.cuda.max_memory_allocated() / (1024**3)
    print("Quantization Complete")
    print(f"Time: {elapsed / 60:.2f} minutes ({elapsed:.2f} seconds)")
    print(f"Peak GPU Memory: {peak_gb:.2f} GB")
    print("\n\n")

    # Quick generation smoke test on the quantized model.
    print("========== SAMPLE GENERATION ==============")
    dispatch_model(model)
    prompt_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
        model.device
    )
    generated = model.generate(prompt_ids, max_new_tokens=100)
    print(tokenizer.decode(generated[0]))
    print("==========================================\n\n")

    save_dir = args.save_dir or (
        MODEL_ID.rstrip("/").split("/")[-1] + f"-{args.scheme}"
    )
    model.save_pretrained(save_dir, save_compressed=True)
    tokenizer.save_pretrained(save_dir)
    print(f"Model saved to {save_dir}")


if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import argparse | |
| import time | |
| import torch | |
| from compressed_tensors.offload import dispatch_model | |
| from datasets import load_dataset | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| from llmcompressor import oneshot | |
| from llmcompressor.modifiers.gptq import GPTQModifier | |
# Calibration configuration for the Llama-3-8B GPTQ (FP8_BLOCK) run.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512


def main():
    """GPTQ-quantize Llama-3-8B-Instruct (FP8_BLOCK), smoke-test, and save."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--actorder", default=None)
    parser.add_argument("--save-dir", default=None)
    parser.add_argument("--num-samples", type=int, default=NUM_CALIBRATION_SAMPLES)
    args = parser.parse_args()
    sample_count = args.num_samples

    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    # Build the calibration set: shuffle, render chat templates, tokenize.
    calib = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{sample_count}]")
    calib = calib.shuffle(seed=42)

    def render_chat(example):
        rendered = tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
        return {"text": rendered}

    calib = calib.map(render_chat)

    def encode(sample):
        return tokenizer(
            sample["text"],
            padding=False,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
            add_special_tokens=False,
        )

    calib = calib.map(encode, remove_columns=calib.column_names)

    # actorder is only forwarded when explicitly requested on the CLI.
    quant_args = dict(
        ignore=["lm_head"],
        scheme="FP8_BLOCK",
        targets=["Linear"],
    )
    if args.actorder:
        quant_args["actorder"] = args.actorder
    recipe = [GPTQModifier(**quant_args)]

    # Quantize while tracking wall time and peak VRAM.
    torch.cuda.reset_peak_memory_stats()
    t0 = time.time()
    oneshot(
        model=model,
        dataset=calib,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=sample_count,
    )
    elapsed = time.time() - t0
    peak_gb = torch.cuda.max_memory_allocated() / (1024**3)
    print("Quantization Complete")
    print(f"Time: {elapsed / 60:.2f} minutes ({elapsed:.2f} seconds)")
    print(f"Peak GPU Memory: {peak_gb:.2f} GB")
    print("\n\n")

    # Quick generation smoke test on the quantized model.
    print("========== SAMPLE GENERATION ==============")
    dispatch_model(model)
    prompt_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
        model.device
    )
    generated = model.generate(prompt_ids, max_new_tokens=100)
    print(tokenizer.decode(generated[0]))
    print("==========================================\n\n")

    save_dir = args.save_dir or (
        MODEL_ID.rstrip("/").split("/")[-1]
        + f"-FP8-BLOCK-GPTQ-{args.actorder or 'no-actorder'}"
    )
    model.save_pretrained(save_dir, save_compressed=True)
    tokenizer.save_pretrained(save_dir)
    print(f"Model saved to {save_dir}")


if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import argparse | |
| import time | |
| import torch | |
| from compressed_tensors.offload import dispatch_model | |
| from datasets import load_dataset | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| from llmcompressor import oneshot | |
| from llmcompressor.modifiers.gptq import GPTQModifier | |
# Calibration configuration for the Llama-3-8B GPTQ (W4A16) run.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512


def main():
    """GPTQ-quantize Llama-3-8B-Instruct (W4A16), smoke-test, and save."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--actorder", default=None)
    parser.add_argument("--save-dir", default=None)
    parser.add_argument("--num-samples", type=int, default=NUM_CALIBRATION_SAMPLES)
    args = parser.parse_args()
    sample_count = args.num_samples

    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    # Build the calibration set: shuffle, render chat templates, tokenize.
    calib = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{sample_count}]")
    calib = calib.shuffle(seed=42)

    def render_chat(example):
        rendered = tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
        return {"text": rendered}

    calib = calib.map(render_chat)

    def encode(sample):
        return tokenizer(
            sample["text"],
            padding=False,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
            add_special_tokens=False,
        )

    calib = calib.map(encode, remove_columns=calib.column_names)

    # actorder is only forwarded when explicitly requested on the CLI.
    quant_args = dict(
        ignore=["lm_head"],
        scheme="W4A16",
        targets=["Linear"],
    )
    if args.actorder:
        quant_args["actorder"] = args.actorder
    recipe = [GPTQModifier(**quant_args)]

    # Quantize while tracking wall time and peak VRAM.
    torch.cuda.reset_peak_memory_stats()
    t0 = time.time()
    oneshot(
        model=model,
        dataset=calib,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=sample_count,
    )
    elapsed = time.time() - t0
    peak_gb = torch.cuda.max_memory_allocated() / (1024**3)
    print("Quantization Complete")
    print(f"Time: {elapsed / 60:.2f} minutes ({elapsed:.2f} seconds)")
    print(f"Peak GPU Memory: {peak_gb:.2f} GB")
    print("\n\n")

    # Quick generation smoke test on the quantized model.
    print("========== SAMPLE GENERATION ==============")
    dispatch_model(model)
    prompt_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
        model.device
    )
    generated = model.generate(prompt_ids, max_new_tokens=100)
    print(tokenizer.decode(generated[0]))
    print("==========================================\n\n")

    save_dir = args.save_dir or (
        MODEL_ID.rstrip("/").split("/")[-1]
        + f"-W4A16-GPTQ-{args.actorder or 'no-actorder'}"
    )
    model.save_pretrained(save_dir, save_compressed=True)
    tokenizer.save_pretrained(save_dir)
    print(f"Model saved to {save_dir}")


if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import argparse | |
| import time | |
| import torch | |
| from compressed_tensors.offload import dispatch_model | |
| from datasets import load_dataset | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| from llmcompressor import oneshot | |
| from llmcompressor.modifiers.gptq import GPTQModifier | |
# Calibration configuration for the Llama-3-8B GPTQ (W8A16) run.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512


def main():
    """GPTQ-quantize Llama-3-8B-Instruct (W8A16), smoke-test, and save."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--actorder", default=None)
    parser.add_argument("--save-dir", default=None)
    parser.add_argument("--num-samples", type=int, default=NUM_CALIBRATION_SAMPLES)
    args = parser.parse_args()
    sample_count = args.num_samples

    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    # Build the calibration set: shuffle, render chat templates, tokenize.
    calib = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{sample_count}]")
    calib = calib.shuffle(seed=42)

    def render_chat(example):
        rendered = tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
        return {"text": rendered}

    calib = calib.map(render_chat)

    def encode(sample):
        return tokenizer(
            sample["text"],
            padding=False,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
            add_special_tokens=False,
        )

    calib = calib.map(encode, remove_columns=calib.column_names)

    # actorder is only forwarded when explicitly requested on the CLI.
    quant_args = dict(
        ignore=["lm_head"],
        scheme="W8A16",
        targets=["Linear"],
    )
    if args.actorder:
        quant_args["actorder"] = args.actorder
    recipe = [GPTQModifier(**quant_args)]

    # Quantize while tracking wall time and peak VRAM.
    torch.cuda.reset_peak_memory_stats()
    t0 = time.time()
    oneshot(
        model=model,
        dataset=calib,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=sample_count,
    )
    elapsed = time.time() - t0
    peak_gb = torch.cuda.max_memory_allocated() / (1024**3)
    print("Quantization Complete")
    print(f"Time: {elapsed / 60:.2f} minutes ({elapsed:.2f} seconds)")
    print(f"Peak GPU Memory: {peak_gb:.2f} GB")
    print("\n\n")

    # Quick generation smoke test on the quantized model.
    print("========== SAMPLE GENERATION ==============")
    dispatch_model(model)
    prompt_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
        model.device
    )
    generated = model.generate(prompt_ids, max_new_tokens=100)
    print(tokenizer.decode(generated[0]))
    print("==========================================\n\n")

    save_dir = args.save_dir or (
        MODEL_ID.rstrip("/").split("/")[-1]
        + f"-W8A16-GPTQ-{args.actorder or 'no-actorder'}"
    )
    model.save_pretrained(save_dir, save_compressed=True)
    tokenizer.save_pretrained(save_dir)
    print(f"Model saved to {save_dir}")


if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import argparse | |
| import time | |
| import torch | |
| from datasets import load_dataset | |
| from transformers import Llama4ForConditionalGeneration, Llama4Processor | |
| from llmcompressor import oneshot | |
| from llmcompressor.modifiers.awq import AWQModifier | |
| from llmcompressor.modifiers.awq.mappings import AWQMapping | |
# Calibration configuration for the Llama-4 Scout AWQ run.
MODEL_ID = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
DATASET_ID = "neuralmagic/calibration"
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 8192
def main():
    """AWQ-quantize Llama-4 Scout with mappings scoped to the shared experts.

    Parses --scheme / --save-dir / --num-samples, calibrates on the
    neuralmagic/calibration "LLM" split, runs oneshot AWQ sequentially over
    Llama4TextMLP modules, and saves the compressed model + processor.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--scheme", default="W4A16_ASYM")
    parser.add_argument("--save-dir", default=None)
    parser.add_argument("--num-samples", type=int, default=NUM_CALIBRATION_SAMPLES)
    args = parser.parse_args()
    num_samples = args.num_samples
    model = Llama4ForConditionalGeneration.from_pretrained(MODEL_ID, dtype="auto")
    processor = Llama4Processor.from_pretrained(MODEL_ID)
    ds = load_dataset(
        DATASET_ID, name="LLM", split=f"train[:{num_samples}]"
    )
    def preprocess_function(example):
        # Re-wrap plain-text messages in the typed content format the
        # Llama-4 chat template expects, then tokenize in a single pass.
        messages = []
        for message in example["messages"]:
            messages.append(
                {
                    "role": message["role"],
                    "content": [{"type": "text", "text": message["content"]}],
                }
            )
        return processor.apply_chat_template(
            messages,
            return_tensors="pt",
            padding=False,
            truncation=True,
            max_length=MAX_SEQUENCE_LENGTH,
            tokenize=True,
            add_special_tokens=False,
            return_dict=True,
            add_generation_prompt=False,
        )
    ds = ds.map(preprocess_function, batched=False, remove_columns=ds.column_names)
    def data_collator(batch):
        # One sample per batch; pixel_values (if present) are cast to
        # bfloat16 and de-batched, everything else becomes a plain tensor.
        assert len(batch) == 1
        return {
            key: (
                torch.tensor(value)
                if key != "pixel_values"
                else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
            )
            for key, value in batch[0].items()
        }
    # Llama-4-Scout has both vision_model and language_model sub-models,
    # so mappings must be scoped to language_model to avoid dual matches.
    # The main experts use a fused gate_up_proj (not Linear), so only
    # shared_expert Linear layers are AWQ targets.
    recipe = AWQModifier(
        targets="Linear",
        scheme=args.scheme,
        ignore=[
            "re:.*lm_head",
            "re:.*self_attn",
            "re:.*router",
            "re:.*vision_model.*",
            "re:.*multi_modal_projector.*",
            "Llama4TextAttention",
        ],
        mappings=[
            # Smooth the layernorm output into the shared expert's
            # gate/up projections...
            AWQMapping(
                "re:.*language_model.*post_attention_layernorm$",
                [
                    "re:.*shared_expert.gate_proj$",
                    "re:.*shared_expert.up_proj$",
                ],
            ),
            # ...and the up projection into the down projection.
            AWQMapping(
                "re:.*shared_expert.up_proj$",
                ["re:.*shared_expert.down_proj$"],
            ),
        ],
    )
    # Track wall time and peak VRAM around the quantization pass.
    torch.cuda.reset_peak_memory_stats()
    start_time = time.time()
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=num_samples,
        data_collator=data_collator,
        sequential_targets=["Llama4TextMLP"],
    )
    elapsed_time = time.time() - start_time
    peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3)
    print("Quantization Complete")
    print(f"Time: {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")
    print(f"Peak GPU Memory: {peak_memory_gb:.2f} GB")
    save_dir = args.save_dir or (
        MODEL_ID.rstrip("/").split("/")[-1] + f"-{args.scheme}"
    )
    model.save_pretrained(save_dir, save_compressed=True)
    processor.save_pretrained(save_dir)
    print(f"Model saved to {save_dir}")
if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import argparse | |
| import time | |
| import torch | |
| from datasets import load_dataset | |
| from transformers import Llama4ForConditionalGeneration, Llama4Processor | |
| from llmcompressor import oneshot | |
| from llmcompressor.modifiers.gptq import GPTQModifier | |
# Calibration configuration for the Llama-4 Scout GPTQ (FP8_BLOCK) run.
MODEL_ID = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
DATASET_ID = "neuralmagic/calibration"
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 8192


def main():
    """GPTQ-quantize Llama-4 Scout (FP8_BLOCK) with optional actorder."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--actorder", default=None)
    parser.add_argument("--save-dir", default=None)
    parser.add_argument("--num-samples", type=int, default=NUM_CALIBRATION_SAMPLES)
    args = parser.parse_args()
    sample_count = args.num_samples

    model = Llama4ForConditionalGeneration.from_pretrained(MODEL_ID, dtype="auto")
    processor = Llama4Processor.from_pretrained(MODEL_ID)
    calib = load_dataset(DATASET_ID, name="LLM", split=f"train[:{sample_count}]")

    def preprocess_function(example):
        # The Llama-4 chat template expects typed content entries.
        wrapped = [
            {
                "role": message["role"],
                "content": [{"type": "text", "text": message["content"]}],
            }
            for message in example["messages"]
        ]
        return processor.apply_chat_template(
            wrapped,
            return_tensors="pt",
            padding=False,
            truncation=True,
            max_length=MAX_SEQUENCE_LENGTH,
            tokenize=True,
            add_special_tokens=False,
            return_dict=True,
            add_generation_prompt=False,
        )

    calib = calib.map(
        preprocess_function, batched=False, remove_columns=calib.column_names
    )

    def data_collator(batch):
        # One sample per batch; pixel_values get bfloat16 + de-batching.
        assert len(batch) == 1
        collated = {}
        for key, value in batch[0].items():
            if key == "pixel_values":
                collated[key] = torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
            else:
                collated[key] = torch.tensor(value)
        return collated

    # actorder is only forwarded when explicitly requested on the CLI.
    quant_args = dict(
        targets="Linear",
        ignore=[
            "re:.*lm_head",
            "re:.*self_attn",
            "re:.*router",
            "re:.*vision_model.*",
            "re:.*multi_modal_projector.*",
            "Llama4TextAttention",
        ],
        scheme="FP8_BLOCK",
    )
    if args.actorder:
        quant_args["actorder"] = args.actorder
    recipe = GPTQModifier(**quant_args)

    # Quantize while tracking wall time and peak VRAM.
    torch.cuda.reset_peak_memory_stats()
    t0 = time.time()
    oneshot(
        model=model,
        dataset=calib,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=sample_count,
        data_collator=data_collator,
        sequential_targets=["Llama4TextMLP"],
    )
    elapsed = time.time() - t0
    peak_gb = torch.cuda.max_memory_allocated() / (1024**3)
    print("Quantization Complete")
    print(f"Time: {elapsed / 60:.2f} minutes ({elapsed:.2f} seconds)")
    print(f"Peak GPU Memory: {peak_gb:.2f} GB")

    save_dir = args.save_dir or (
        MODEL_ID.rstrip("/").split("/")[-1]
        + f"-FP8-BLOCK-GPTQ-{args.actorder or 'no-actorder'}"
    )
    model.save_pretrained(save_dir, save_compressed=True)
    processor.save_pretrained(save_dir)
    print(f"Model saved to {save_dir}")


if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import argparse | |
| import time | |
| import torch | |
| from compressed_tensors.offload import dispatch_model | |
| from datasets import load_dataset | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| from llmcompressor import oneshot | |
| from llmcompressor.modifiers.awq import AWQModifier | |
| from llmcompressor.modifiers.awq.mappings import AWQMapping | |
# Calibration configuration for the Mixtral-8x7B AWQ run.
MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512
def main():
    """AWQ-quantize Mixtral-8x7B with custom expert (w1/w2/w3) mappings.

    Parses --scheme / --save-dir / --num-samples, calibrates on
    ultrachat_200k, runs oneshot AWQ (skipping the MoE routing gates),
    smoke-tests generation, and saves the compressed model + tokenizer.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--scheme", default="W4A16_ASYM")
    parser.add_argument("--save-dir", default=None)
    parser.add_argument("--num-samples", type=int, default=NUM_CALIBRATION_SAMPLES)
    args = parser.parse_args()
    num_samples = args.num_samples
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID, dtype=torch.bfloat16, trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{num_samples}]")
    ds = ds.shuffle(seed=42)
    def preprocess(example):
        # Render each conversation with the model's chat template.
        return {
            "text": tokenizer.apply_chat_template(
                example["messages"],
                tokenize=False,
            )
        }
    ds = ds.map(preprocess)
    def tokenize(sample):
        # Tokenize without padding; truncate to the calibration length.
        return tokenizer(
            sample["text"],
            padding=False,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
            add_special_tokens=False,
        )
    ds = ds.map(tokenize, remove_columns=ds.column_names)
    # Mixtral uses w1/w2/w3 naming for expert layers instead of
    # gate_proj/up_proj/down_proj, so we need custom mappings
    recipe = [
        AWQModifier(
            ignore=[
                "lm_head",
                "re:.*block_sparse_moe.gate",
            ],
            scheme=args.scheme,
            targets=["Linear"],
            duo_scaling="both",
            mappings=[
                # Attention: layernorm -> q/k/v, then v -> o.
                AWQMapping(
                    "re:.*input_layernorm$",
                    ["re:.*q_proj$", "re:.*k_proj$", "re:.*v_proj$"],
                ),
                AWQMapping("re:.*v_proj$", ["re:.*o_proj$"]),
                # MoE experts: layernorm -> w1/w3, then w3 -> w2.
                AWQMapping(
                    "re:.*post_attention_layernorm$",
                    [
                        "re:.*block_sparse_moe.experts.*.w1$",
                        "re:.*block_sparse_moe.experts.*.w3$",
                    ],
                ),
                AWQMapping("re:.*w3$", ["re:.*w2$"]),
            ],
        ),
    ]
    # Track wall time and peak VRAM around the quantization pass.
    torch.cuda.reset_peak_memory_stats()
    start_time = time.time()
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=num_samples,
        trust_remote_code_model=True,
    )
    elapsed_time = time.time() - start_time
    peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3)
    print("Quantization Complete")
    print(f"Time: {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")
    print(f"Peak GPU Memory: {peak_memory_gb:.2f} GB")
    print("\n\n")
    # Quick generation smoke test on the quantized model.
    print("========== SAMPLE GENERATION ==============")
    dispatch_model(model)
    input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
        model.device
    )
    output = model.generate(input_ids, max_new_tokens=100)
    print(tokenizer.decode(output[0]))
    print("==========================================\n\n")
    save_dir = args.save_dir or (
        MODEL_ID.rstrip("/").split("/")[-1] + f"-{args.scheme}"
    )
    model.save_pretrained(save_dir, save_compressed=True)
    tokenizer.save_pretrained(save_dir)
    print(f"Model saved to {save_dir}")
if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import argparse | |
| import time | |
| import torch | |
| from compressed_tensors.offload import dispatch_model | |
| from datasets import load_dataset | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| from llmcompressor import oneshot | |
| from llmcompressor.modifiers.gptq import GPTQModifier | |
# Calibration configuration for the Mixtral-8x7B GPTQ (FP8_BLOCK) run.
MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512


def main():
    """GPTQ-quantize Mixtral-8x7B (FP8_BLOCK), smoke-test, and save."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--actorder", default=None)
    parser.add_argument("--save-dir", default=None)
    parser.add_argument("--num-samples", type=int, default=NUM_CALIBRATION_SAMPLES)
    args = parser.parse_args()
    sample_count = args.num_samples

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID, dtype=torch.bfloat16, trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

    # Build the calibration set: shuffle, render chat templates, tokenize.
    calib = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{sample_count}]")
    calib = calib.shuffle(seed=42)

    def render_chat(example):
        rendered = tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
        return {"text": rendered}

    calib = calib.map(render_chat)

    def encode(sample):
        return tokenizer(
            sample["text"],
            padding=False,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
            add_special_tokens=False,
        )

    calib = calib.map(encode, remove_columns=calib.column_names)

    # Skip the LM head and the MoE routing gates; actorder is only
    # forwarded when explicitly requested on the CLI.
    quant_args = dict(
        ignore=["lm_head", "re:.*block_sparse_moe.gate"],
        scheme="FP8_BLOCK",
        targets=["Linear"],
    )
    if args.actorder:
        quant_args["actorder"] = args.actorder
    recipe = [GPTQModifier(**quant_args)]

    # Quantize while tracking wall time and peak VRAM.
    torch.cuda.reset_peak_memory_stats()
    t0 = time.time()
    oneshot(
        model=model,
        dataset=calib,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=sample_count,
        trust_remote_code_model=True,
    )
    elapsed = time.time() - t0
    peak_gb = torch.cuda.max_memory_allocated() / (1024**3)
    print("Quantization Complete")
    print(f"Time: {elapsed / 60:.2f} minutes ({elapsed:.2f} seconds)")
    print(f"Peak GPU Memory: {peak_gb:.2f} GB")
    print("\n\n")

    # Quick generation smoke test on the quantized model.
    print("========== SAMPLE GENERATION ==============")
    dispatch_model(model)
    prompt_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
        model.device
    )
    generated = model.generate(prompt_ids, max_new_tokens=100)
    print(tokenizer.decode(generated[0]))
    print("==========================================\n\n")

    save_dir = args.save_dir or (
        MODEL_ID.rstrip("/").split("/")[-1]
        + f"-FP8-BLOCK-GPTQ-{args.actorder or 'no-actorder'}"
    )
    model.save_pretrained(save_dir, save_compressed=True)
    tokenizer.save_pretrained(save_dir)
    print(f"Model saved to {save_dir}")


if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import time

import torch
from compressed_tensors.offload import dispatch_model
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier

# Model and calibration-data configuration.
MODEL_ID = "Qwen/Qwen2.5-32B-Instruct"
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512


def main():
    """AWQ-quantize Qwen2.5-32B-Instruct with a selectable scheme.

    Reports wall time and peak GPU memory, runs a short sample generation as
    a smoke test, and saves the compressed model and tokenizer to
    ``--save-dir`` (default: ``<model>-<scheme>``).
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--scheme", default="W4A16_ASYM")
    arg_parser.add_argument("--save-dir", default=None)
    arg_parser.add_argument("--num-samples", type=int, default=NUM_CALIBRATION_SAMPLES)
    args = arg_parser.parse_args()
    num_samples = args.num_samples

    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    # Calibration set: a shuffled prefix of the chat split.
    dataset = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{num_samples}]")
    dataset = dataset.shuffle(seed=42)

    def render_chat(example):
        # Flatten the chat messages into a single prompt string.
        return {
            "text": tokenizer.apply_chat_template(
                example["messages"],
                tokenize=False,
            )
        }

    dataset = dataset.map(render_chat)

    def encode(sample):
        return tokenizer(
            sample["text"],
            padding=False,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
            add_special_tokens=False,
        )

    dataset = dataset.map(encode, remove_columns=dataset.column_names)

    recipe = [
        AWQModifier(
            ignore=["lm_head"],
            scheme=args.scheme,
            targets=["Linear"],
            duo_scaling="both",
        ),
    ]

    torch.cuda.reset_peak_memory_stats()
    start_time = time.time()
    oneshot(
        model=model,
        dataset=dataset,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=num_samples,
    )
    elapsed_time = time.time() - start_time
    peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3)
    print("Quantization Complete")
    print(f"Time: {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")
    print(f"Peak GPU Memory: {peak_memory_gb:.2f} GB")
    print("\n\n")

    print("========== SAMPLE GENERATION ==============")
    dispatch_model(model)
    input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
        model.device
    )
    output = model.generate(input_ids, max_new_tokens=100)
    print(tokenizer.decode(output[0]))
    print("==========================================\n\n")

    save_dir = args.save_dir or (
        MODEL_ID.rstrip("/").split("/")[-1] + f"-{args.scheme}"
    )
    model.save_pretrained(save_dir, save_compressed=True)
    tokenizer.save_pretrained(save_dir)
    print(f"Model saved to {save_dir}")


if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import time

import torch
from compressed_tensors.offload import dispatch_model
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.gptq import GPTQModifier

# Model and calibration-data configuration.
MODEL_ID = "Qwen/Qwen2.5-32B-Instruct"
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512


def main():
    """Quantize Qwen2.5-32B-Instruct to FP8_BLOCK with GPTQ.

    Optionally applies GPTQ activation ordering (``--actorder``), reports
    wall time and peak GPU memory, runs a short sample generation as a smoke
    test, and saves the compressed model and tokenizer to ``--save-dir``
    (default: a name derived from the model id and actorder setting).
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--actorder", default=None)
    arg_parser.add_argument("--save-dir", default=None)
    arg_parser.add_argument("--num-samples", type=int, default=NUM_CALIBRATION_SAMPLES)
    args = arg_parser.parse_args()
    num_samples = args.num_samples

    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    # Calibration set: a shuffled prefix of the chat split.
    dataset = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{num_samples}]")
    dataset = dataset.shuffle(seed=42)

    def render_chat(example):
        # Flatten the chat messages into a single prompt string.
        return {
            "text": tokenizer.apply_chat_template(
                example["messages"],
                tokenize=False,
            )
        }

    dataset = dataset.map(render_chat)

    def encode(sample):
        return tokenizer(
            sample["text"],
            padding=False,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
            add_special_tokens=False,
        )

    dataset = dataset.map(encode, remove_columns=dataset.column_names)

    gptq_kwargs = {
        "ignore": ["lm_head"],
        "scheme": "FP8_BLOCK",
        "targets": ["Linear"],
    }
    if args.actorder:
        gptq_kwargs["actorder"] = args.actorder
    recipe = [GPTQModifier(**gptq_kwargs)]

    torch.cuda.reset_peak_memory_stats()
    start_time = time.time()
    oneshot(
        model=model,
        dataset=dataset,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=num_samples,
    )
    elapsed_time = time.time() - start_time
    peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3)
    print("Quantization Complete")
    print(f"Time: {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")
    print(f"Peak GPU Memory: {peak_memory_gb:.2f} GB")
    print("\n\n")

    print("========== SAMPLE GENERATION ==============")
    dispatch_model(model)
    input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
        model.device
    )
    output = model.generate(input_ids, max_new_tokens=100)
    print(tokenizer.decode(output[0]))
    print("==========================================\n\n")

    save_dir = args.save_dir or (
        MODEL_ID.rstrip("/").split("/")[-1]
        + f"-FP8-BLOCK-GPTQ-{args.actorder or 'no-actorder'}"
    )
    model.save_pretrained(save_dir, save_compressed=True)
    tokenizer.save_pretrained(save_dir)
    print(f"Model saved to {save_dir}")


if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import base64
import time
from io import BytesIO

import torch
from compressed_tensors.offload import dispatch_model
from datasets import load_dataset
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen3VLForConditionalGeneration

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier

# Model and calibration-data configuration.
MODEL_ID = "Qwen/Qwen3-VL-8B-Instruct"
DATASET_ID = "lmms-lab/flickr30k"
DATASET_SPLIT = "test"
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048


def main():
    """AWQ-quantize Qwen3-VL-8B-Instruct on flickr30k image/caption data.

    Only the text decoder is quantized (the vision tower and lm_head are
    ignored). Reports wall time and peak GPU memory, runs a sample
    image-description generation as a smoke test, and saves the compressed
    model and processor to ``--save-dir`` (default: ``<model>-<scheme>``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--scheme", default="W4A16_ASYM")
    parser.add_argument("--save-dir", default=None)
    parser.add_argument("--num-samples", type=int, default=NUM_CALIBRATION_SAMPLES)
    args = parser.parse_args()
    num_samples = args.num_samples

    # `dtype=` is the current transformers spelling (`torch_dtype` is
    # deprecated) and matches the other scripts in this suite.
    model = Qwen3VLForConditionalGeneration.from_pretrained(
        MODEL_ID, dtype="auto"
    )
    processor = AutoProcessor.from_pretrained(MODEL_ID)

    ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{num_samples}]")
    ds = ds.shuffle(seed=42)

    def preprocess_and_tokenize(example):
        # Re-encode the PIL image as a base64 data URI so it can be embedded
        # directly in the chat message content.
        buffered = BytesIO()
        example["image"].save(buffered, format="PNG")
        encoded_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
        base64_qwen = f"data:image;base64,{encoded_image}"
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": base64_qwen},
                    {"type": "text", "text": "What does the image show?"},
                ],
            }
        ]
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        return processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=False,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
        )

    ds = ds.map(preprocess_and_tokenize, remove_columns=ds.column_names)

    def data_collator(batch):
        # Calibration runs with batch size 1; tensorize the single sample.
        assert len(batch) == 1
        return {key: torch.tensor(value) for key, value in batch[0].items()}

    recipe = AWQModifier(
        scheme=args.scheme,
        ignore=["re:.*lm_head", "re:.*visual.*"],
        duo_scaling=False,
    )

    torch.cuda.reset_peak_memory_stats()
    start_time = time.time()
    oneshot(
        model=model,
        tokenizer=MODEL_ID,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=num_samples,
        data_collator=data_collator,
        sequential_targets=["Qwen3VLTextDecoderLayer"],
    )
    elapsed_time = time.time() - start_time
    peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3)
    print("Quantization Complete")
    print(f"Time: {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")
    print(f"Peak GPU Memory: {peak_memory_gb:.2f} GB")
    print("\n\n")

    print("========== SAMPLE GENERATION ==============")
    dispatch_model(model)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": "http://images.cocodataset.org/train2017/000000231895.jpg",
                },
                {"type": "text", "text": "Please describe the animal in this image\n"},
            ],
        }
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[prompt],
        images=image_inputs,
        videos=video_inputs,
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        return_tensors="pt",
    ).to(model.device)
    output = model.generate(**inputs, max_new_tokens=100)
    print(processor.decode(output[0], skip_special_tokens=True))
    print("==========================================\n\n")

    save_dir = args.save_dir or (
        MODEL_ID.rstrip("/").split("/")[-1] + f"-{args.scheme}"
    )
    model.save_pretrained(save_dir, save_compressed=True)
    processor.save_pretrained(save_dir)
    print(f"Model saved to {save_dir}")


if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import base64
import time
from io import BytesIO

import torch
from compressed_tensors.offload import dispatch_model
from datasets import load_dataset
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen3VLForConditionalGeneration

from llmcompressor import oneshot
from llmcompressor.modifiers.gptq import GPTQModifier

# Model and calibration-data configuration.
MODEL_ID = "Qwen/Qwen3-VL-8B-Instruct"
DATASET_ID = "lmms-lab/flickr30k"
DATASET_SPLIT = "test"
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048


def main():
    """GPTQ-quantize Qwen3-VL-8B-Instruct to FP8_BLOCK on flickr30k data.

    Only the text decoder is quantized (the vision tower and lm_head are
    ignored). Optionally applies GPTQ activation ordering (``--actorder``),
    reports wall time and peak GPU memory, runs a sample image-description
    generation as a smoke test, and saves the compressed model and processor
    to ``--save-dir`` (default: derived from model id and actorder setting).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--actorder", default=None)
    parser.add_argument("--save-dir", default=None)
    parser.add_argument("--num-samples", type=int, default=NUM_CALIBRATION_SAMPLES)
    args = parser.parse_args()
    num_samples = args.num_samples

    # `dtype=` is the current transformers spelling (`torch_dtype` is
    # deprecated) and matches the other scripts in this suite.
    model = Qwen3VLForConditionalGeneration.from_pretrained(
        MODEL_ID, dtype="auto"
    )
    processor = AutoProcessor.from_pretrained(MODEL_ID)

    ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{num_samples}]")
    ds = ds.shuffle(seed=42)

    def preprocess_and_tokenize(example):
        # Re-encode the PIL image as a base64 data URI so it can be embedded
        # directly in the chat message content.
        buffered = BytesIO()
        example["image"].save(buffered, format="PNG")
        encoded_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
        base64_qwen = f"data:image;base64,{encoded_image}"
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": base64_qwen},
                    {"type": "text", "text": "What does the image show?"},
                ],
            }
        ]
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        return processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=False,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
        )

    ds = ds.map(preprocess_and_tokenize, remove_columns=ds.column_names)

    def data_collator(batch):
        # Calibration runs with batch size 1; tensorize the single sample.
        assert len(batch) == 1
        return {key: torch.tensor(value) for key, value in batch[0].items()}

    gptq_kwargs = dict(
        ignore=["re:.*lm_head", "re:.*visual.*"],
        scheme="FP8_BLOCK",
        targets=["Linear"],
    )
    if args.actorder:
        gptq_kwargs["actorder"] = args.actorder
    recipe = [GPTQModifier(**gptq_kwargs)]

    torch.cuda.reset_peak_memory_stats()
    start_time = time.time()
    oneshot(
        model=model,
        tokenizer=MODEL_ID,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=num_samples,
        data_collator=data_collator,
        sequential_targets=["Qwen3VLTextDecoderLayer"],
    )
    elapsed_time = time.time() - start_time
    peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3)
    print("Quantization Complete")
    print(f"Time: {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")
    print(f"Peak GPU Memory: {peak_memory_gb:.2f} GB")
    print("\n\n")

    print("========== SAMPLE GENERATION ==============")
    dispatch_model(model)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": "http://images.cocodataset.org/train2017/000000231895.jpg",
                },
                {"type": "text", "text": "Please describe the animal in this image\n"},
            ],
        }
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[prompt],
        images=image_inputs,
        videos=video_inputs,
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        return_tensors="pt",
    ).to(model.device)
    output = model.generate(**inputs, max_new_tokens=100)
    print(processor.decode(output[0], skip_special_tokens=True))
    print("==========================================\n\n")

    save_dir = args.save_dir or (
        MODEL_ID.rstrip("/").split("/")[-1]
        + f"-FP8-BLOCK-GPTQ-{args.actorder or 'no-actorder'}"
    )
    model.save_pretrained(save_dir, save_compressed=True)
    processor.save_pretrained(save_dir)
    print(f"Model saved to {save_dir}")


if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/bash | |
| # GPTQ actorder Regression Test Suite | |
| # Compares without-actorder vs with-actorder (actorder=weight) for GPTQ | |
| # quantization across models, schemes, and benchmarks. | |
| # | |
| # Usage: | |
| # ./run_all_tests.sh 2>&1 | tee regression_results.log | |
| # python extract_log_summary.py regression_results.log | |
| # | |
| # Models are saved to disk and NOT cleaned up for follow-up evaluation. | |
set -o pipefail
# Avoid permission errors on shared HF cache files
export HF_DATASETS_CACHE="/tmp/hf_datasets_cache"
mkdir -p "$HF_DATASETS_CACHE"
# ── Configuration ────────────────────────────────────────────────────────────
# Each entry defines one (script, model, scheme, vllm_args) test configuration.
# The scheme is tied to the script — no cross-product.
#
# NOTE: SCRIPTS, MODEL_SHORT_NAMES, MODEL_SCHEMES and MODEL_VLLM_ARGS are
# parallel arrays: index i of each describes the same test. When enabling or
# disabling an entry, toggle the same position in all four arrays.
SCRIPTS=(
    # "testing/llama3_fp8_block.py"
    # "testing/qwen25_32b_fp8_block.py"
    # "testing/qwen3_vl_fp8_block.py"
    # "testing/llama4_scout_fp8_block.py"
    # "testing/mixtral_fp8_block.py"
    "testing/llama3_w4a16_gptq.py"
    "testing/llama3_w8a16_gptq.py"
)
MODEL_SHORT_NAMES=(
    # "Meta-Llama-3-8B-Instruct"
    # "Qwen2.5-32B-Instruct"
    # "Qwen3-VL-8B-Instruct"
    # "Llama-4-Scout-17B-16E-Instruct"
    # "Mixtral-8x7B-Instruct-v0.1"
    "Meta-Llama-3-8B-Instruct"
    "Meta-Llama-3-8B-Instruct"
)
# Scheme label per entry (used for naming and CSV output)
MODEL_SCHEMES=(
    # "FP8_BLOCK"
    # "FP8_BLOCK"
    # "FP8_BLOCK"
    # "FP8_BLOCK"
    # "FP8_BLOCK"
    "W4A16"
    "W8A16"
)
# vLLM eval settings per entry: max_model_len,tensor_parallel_size,num_gpus_quant
MODEL_VLLM_ARGS=(
    # "2048,1,1"
    # "4096,2,1"
    # "4096,1,1"
    # "4096,2,1"
    # "2048,2,1"
    "2048,1,1"
    "2048,1,1"
)
# without-actorder: no actorder flag (standard GPTQ)
# with-actorder: actorder=weight
# with-group-actorder: actorder=group
ACTORDER_STATES=("without-actorder" "with-actorder" "with-group-actorder")
# Benchmark definitions — also parallel arrays (one entry per eval):
# eval_name        lm_eval_task     fewshot  backend
# gsm8k            gsm8k            5        vllm
# gsm8k_platinum   gsm8k_platinum   5        vllm
# wikitext         wikitext         0        vllm
# mmlu             mmlu             5        vllm
EVAL_NAMES=("gsm8k" "gsm8k_platinum" "wikitext" "mmlu")
EVAL_LM_TASKS=("gsm8k" "gsm8k_platinum" "wikitext" "mmlu")
EVAL_FEWSHOT=("5" "5" "0" "5")
EVAL_BACKENDS=("vllm" "vllm" "vllm" "vllm")
# Output locations: eval JSONs, quantized checkpoints, and the summary CSV.
EVAL_BASE_DIR="./eval_results"
MODEL_BASE_DIR="./regression_models"
RESULTS_CSV="regression_results.csv"
mkdir -p "$EVAL_BASE_DIR" "$MODEL_BASE_DIR"
# ── Helper: activate environments ────────────────────────────────────────────
# Quantization and evaluation use separate virtualenvs (llm-compressor dev
# env vs vLLM/lm_eval env); these helpers switch the current shell between
# them. Paths are machine-specific — adjust for your host.
activate_quant_env() {
    source /home/HDCharles/rhdev/bin/activate
}
activate_eval_env() {
    source /home/HDCharles/vllm/bin/activate
}
# ── Helper: run vLLM evaluation with fallback chain ──────────────────────────
# Tries progressively more conservative lm_eval configurations until one
# succeeds: TP=N → expert-parallel → TP=1 → enforce_eager → HF backend.
EVAL_BACKEND=""  # set by run_vllm_eval to indicate which backend succeeded
run_vllm_eval() {
    local save_dir=$1
    local task=$2
    local num_fewshot=$3
    local max_model_len=$4
    local tp_size=$5
    local eval_output_dir=$6
    mkdir -p "$eval_output_dir"
    EVAL_BACKEND="FAILED"
    activate_eval_env
    echo " EVAL: $task (fewshot=$num_fewshot, tp=$tp_size, max_len=$max_model_len)"
    # Build common eval flags
    local chat_args="--apply_chat_template"
    if [ "$num_fewshot" -gt 0 ]; then
        chat_args="$chat_args --fewshot_as_multiturn"
    fi
    # One lm_eval attempt. $1 = lm_eval backend, $2 = model_args string,
    # $3 = EVAL_BACKEND label recorded on success. task/num_fewshot/
    # chat_args/eval_output_dir come from the caller via bash dynamic scoping.
    _lm_eval_attempt() {
        local backend=$1 model_args=$2 label=$3
        if lm_eval \
            --model "$backend" \
            --model_args "$model_args" \
            --tasks "$task" \
            --num_fewshot "$num_fewshot" \
            --batch_size auto \
            $chat_args \
            --output_path "$eval_output_dir" 2>&1; then
            EVAL_BACKEND="$label"
            return 0
        fi
        return 1
    }
    # Shared model_args prefix for all vLLM attempts.
    local common="pretrained=$save_dir,dtype=auto,max_model_len=$max_model_len,add_bos_token=True"
    if [ "$tp_size" -gt 1 ]; then
        _lm_eval_attempt vllm "$common,tensor_parallel_size=$tp_size,gpu_memory_utilization=0.85" "vllm_tp${tp_size}" && return 0
        echo " TP=$tp_size failed, trying expert_parallel..."
        _lm_eval_attempt vllm "$common,enable_expert_parallel=True,gpu_memory_utilization=0.85" "vllm_expert_parallel" && return 0
    fi
    echo " Trying TP=1..."
    _lm_eval_attempt vllm "$common,gpu_memory_utilization=0.85" "vllm_tp1" && return 0
    echo " Trying enforce_eager..."
    _lm_eval_attempt vllm "$common,enforce_eager=True,gpu_memory_utilization=0.85" "vllm_eager" && return 0
    echo " Trying hf backend as last resort..."
    # HF fallback deliberately omits max_model_len (vLLM-only option).
    _lm_eval_attempt hf "pretrained=$save_dir,dtype=auto,add_bos_token=True" "hf" && return 0
    EVAL_BACKEND="FAILED"
    return 1
}
# ── Helper: run HF-only evaluation ───────────────────────────────────────────
# Single lm_eval pass with the HF backend (no vLLM fallback chain). Sets the
# global EVAL_BACKEND to "hf" on success, "FAILED" otherwise.
run_hf_eval() {
    local save_dir=$1
    local task=$2
    local num_fewshot=$3
    local eval_output_dir=$4
    mkdir -p "$eval_output_dir"
    EVAL_BACKEND="FAILED"
    activate_eval_env
    echo " EVAL: $task (fewshot=$num_fewshot, backend=hf)"
    # Build common eval flags
    local chat_args="--apply_chat_template"
    [ "$num_fewshot" -gt 0 ] && chat_args="$chat_args --fewshot_as_multiturn"
    if lm_eval \
        --model hf \
        --model_args "pretrained=$save_dir,dtype=auto,add_bos_token=True" \
        --tasks "$task" \
        --num_fewshot "$num_fewshot" \
        --batch_size auto \
        $chat_args \
        --output_path "$eval_output_dir" 2>&1; then
        EVAL_BACKEND="hf"
        return 0
    fi
    EVAL_BACKEND="FAILED"
    return 1
}
# ── Helper: extract metric from lm_eval JSON results ─────────────────────────
# Prints the primary metric for $2 (task) from the newest results_*.json
# under $1, or "N/A" if nothing parseable is found.
extract_metric() {
    local eval_output_dir=$1
    local task=$2
    # Find the most recent results JSON in the eval output dir
    local results_json
    results_json=$(find "$eval_output_dir" -name "results_*.json" -type f 2>/dev/null | sort | tail -1)
    if [ -z "$results_json" ]; then
        echo "N/A"
        return
    fi
    # Pass the path and task via argv instead of interpolating them into the
    # Python source — robust against quotes/special characters in paths.
    python3 - "$results_json" "$task" 2>/dev/null <<'PYEOF' || echo "N/A"
import json, sys

results_path, task = sys.argv[1], sys.argv[2]
with open(results_path) as f:
    data = json.load(f)
results = data.get('results', {})
# Handle task name variations (e.g., wikitext vs wikitext2)
task_results = None
for key in results:
    if task in key:
        task_results = results[key]
        break
if task_results is None:
    print('N/A')
    sys.exit()
# Extract the primary metric for each task
if 'gsm8k' in task:
    val = task_results.get('exact_match,strict-match')
    if val is not None:
        print(f'{val*100:.2f}%')
    else:
        print('N/A')
elif 'wikitext' in task:
    val = task_results.get('word_perplexity,none')
    if val is not None:
        print(f'{val:.2f}')
    else:
        print('N/A')
elif 'mmlu' in task:
    val = task_results.get('acc,none')
    if val is not None:
        print(f'{val*100:.2f}%')
    else:
        print('N/A')
else:
    # Generic: grab first non-stderr, non-alias metric
    for k, v in task_results.items():
        if 'stderr' not in k and k != 'alias' and isinstance(v, (int, float)):
            print(f'{v:.4f}')
            sys.exit()
    print('N/A')
PYEOF
}
# ── Helper: print current results summary ────────────────────────────────────
# Renders the accumulated CSV as an aligned table (or a placeholder when no
# results have been written yet).
print_summary() {
    echo ""
    echo "╔══════════════════════════════════════════════════════════════════════════════════════════════════════╗"
    echo "║ RESULTS SUMMARY (so far) ║"
    echo "╚══════════════════════════════════════════════════════════════════════════════════════════════════════╝"
    echo ""
    if [ ! -f "$RESULTS_CSV" ]; then
        echo "(no results yet)"
    else
        # Print header + all rows as a formatted table
        column -t -s',' < "$RESULTS_CSV"
    fi
    echo ""
    echo "════════════════════════════════════════════════════════════════════════════════════════════════════════"
    echo ""
}
# ── Helper: print actorder comparison table ──────────────────────────────────
# Reads the results CSV and prints, per (model, scheme, task), the baseline
# (without-actorder) metric next to each actorder variant with a relative
# improvement percentage. Prints nothing until a baseline plus at least one
# variant exists for some row.
print_comparison() {
    if [ ! -f "$RESULTS_CSV" ]; then
        return
    fi
    python3 - "$RESULTS_CSV" <<'PYEOF'
import csv, sys

csv_path = sys.argv[1]
rows = []
with open(csv_path) as f:
    reader = csv.DictReader(f)
    for r in reader:
        rows.append(r)
if not rows:
    sys.exit()
actorder_keys = ["without-actorder", "with-actorder", "with-group-actorder"]
# Build lookup: (model, scheme, task) -> {actorder_state: metric}
lookup = {}
for r in rows:
    key = (r["model"], r["scheme"], r["task"])
    lookup.setdefault(key, {})
    lookup[key][r["actorder"]] = r["metric"]
# Only print if we have at least one row with baseline + one other
entries = [(k, v) for k, v in lookup.items()
           if "without-actorder" in v and
           any(s in v for s in actorder_keys[1:])]
if not entries:
    sys.exit()
def parse_metric(s):
    # Returns (value, is_percentage); (None, False) if unparseable (e.g. "N/A").
    s = s.strip()
    if s.endswith("%"):
        return float(s[:-1]), True
    try:
        return float(s), False
    except ValueError:
        return None, False
def calc_improvement(baseline_str, compare_str, task):
    b_val, _ = parse_metric(baseline_str)
    c_val, _ = parse_metric(compare_str)
    if b_val is None or c_val is None or b_val == 0:
        return "N/A"
    # wikitext reports perplexity, where lower is better — invert the sign.
    if "wikitext" in task:
        pct = (b_val - c_val) / b_val * 100
    else:
        pct = (c_val - b_val) / b_val * 100
    sign = "+" if pct >= 0 else ""
    return f"{sign}{pct:.2f}%"
print("")
print("╔════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╗")
print("║ ACTORDER COMPARISON (vs without-actorder baseline) ║")
print("╚════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╝")
print("")
header = (f"{'model':<36} {'scheme':<12} {'task':<18} "
          f"{'no-actorder':>14} "
          f"{'weight':>14} {'wt vs base':>12} "
          f"{'group':>14} {'grp vs base':>12}")
print(header)
print("-" * len(header))
for (model, scheme, task), metrics in sorted(entries):
    wo = metrics.get("without-actorder", "")
    wi = metrics.get("with-actorder", "")
    wg = metrics.get("with-group-actorder", "")
    wi_imp = calc_improvement(wo, wi, task) if wo and wi else ""
    wg_imp = calc_improvement(wo, wg, task) if wo and wg else ""
    print(f"{model:<36} {scheme:<12} {task:<18} "
          f"{wo:>14} "
          f"{wi:>14} {wi_imp:>12} "
          f"{wg:>14} {wg_imp:>12}")
print("")
PYEOF
}
# ── Initialize results CSV (preserve previous results) ───────────────────────
# Back up any existing CSV, then start a fresh one with only the header row.
[ -f "$RESULTS_CSV" ] && cp "$RESULTS_CSV" "${RESULTS_CSV}.bak"
echo "model,scheme,actorder,task,metric,status,eval_backend,save_dir" > "$RESULTS_CSV"
| # ── Main loop ──────────────────────────────────────────────────────────────── | |
| TOTAL=0 | |
| PASSED=0 | |
| FAILED=0 | |
| for model_idx in "${!SCRIPTS[@]}"; do | |
| script="${SCRIPTS[$model_idx]}" | |
| model_name="${MODEL_SHORT_NAMES[$model_idx]}" | |
| scheme="${MODEL_SCHEMES[$model_idx]}" | |
| IFS=',' read -r max_model_len tp_size num_gpus_quant <<< "${MODEL_VLLM_ARGS[$model_idx]}" | |
| for actorder_state in "${ACTORDER_STATES[@]}"; do | |
| save_dir="$MODEL_BASE_DIR/${model_name}-${scheme}-${actorder_state}" | |
| echo "" | |
| echo "╔══════════════════════════════════════════════════════════════════════════════════════════╗" | |
| echo "║ MODEL: $model_name" | |
| echo "║ SCHEME: $scheme" | |
| echo "║ ACTORDER: $actorder_state" | |
| echo "╚══════════════════════════════════════════════════════════════════════════════════════════╝" | |
| echo "" | |
| # ── Skip entirely if all evals already have results ──── | |
| all_evals_cached=true | |
| for eval_idx in "${!EVAL_NAMES[@]}"; do | |
| eval_dir="$EVAL_BASE_DIR/${model_name}-${scheme}-${actorder_state}/${EVAL_NAMES[$eval_idx]}" | |
| if ! find "$eval_dir" -name "results_*.json" -type f 2>/dev/null | grep -q .; then | |
| all_evals_cached=false | |
| break | |
| fi | |
| done | |
| if [ "$all_evals_cached" = true ]; then | |
| echo "All evals already cached, skipping quantization and eval." | |
| for eval_idx in "${!EVAL_NAMES[@]}"; do | |
| eval_name="${EVAL_NAMES[$eval_idx]}" | |
| lm_task="${EVAL_LM_TASKS[$eval_idx]}" | |
| eval_dir="$EVAL_BASE_DIR/${model_name}-${scheme}-${actorder_state}/${eval_name}" | |
| metric_val=$(extract_metric "$eval_dir" "$lm_task") | |
| echo " $eval_name: $metric_val" | |
| echo "$model_name,$scheme,$actorder_state,$eval_name,$metric_val,PASSED,cached,$save_dir" >> "$RESULTS_CSV" | |
| PASSED=$((PASSED + 1)) | |
| TOTAL=$((TOTAL + 1)) | |
| done | |
| print_summary | |
| print_comparison | |
| continue | |
| fi | |
# ── Quantize (skip if model already exists) ────────────
# A previous run may have left the quantized checkpoint on disk; the presence
# of config.json in $save_dir is treated as proof the save completed.
if [ -d "$save_dir" ] && [ -f "$save_dir/config.json" ]; then
    echo "Quantized model already exists at $save_dir, skipping quantization."
else
    activate_quant_env
    echo "============================================"
    echo "Running: $script (actorder_state=$actorder_state)"
    echo "============================================"
    # Build the actorder argument as an array: an empty array expands to zero
    # words and a two-word value stays two distinct argv entries, without
    # relying on unquoted word-splitting (shellcheck SC2086). Requires
    # bash >= 4.4 when running under `set -u`.
    actorder_args=()
    if [ "$actorder_state" == "with-actorder" ]; then
        actorder_args=(--actorder weight)
    elif [ "$actorder_state" == "with-group-actorder" ]; then
        actorder_args=(--actorder group)
    fi
    if [ "$num_gpus_quant" -gt 1 ]; then
        torchrun --nproc_per_node="$num_gpus_quant" "$script" \
            "${actorder_args[@]}" --save-dir "$save_dir" 2>&1
    else
        python "$script" "${actorder_args[@]}" --save-dir "$save_dir" 2>&1
    fi
    quant_status=$?  # exit status of the torchrun/python command above
    if [ $quant_status -ne 0 ]; then
        echo "QUANTIZATION FAILED for $model_name / $scheme / $actorder_state"
        # Record a QUANT_FAILED row for every planned eval so the CSV keeps
        # one row per (combo, eval) regardless of outcome.
        for eval_name in "${EVAL_NAMES[@]}"; do
            echo "$model_name,$scheme,$actorder_state,$eval_name,N/A,QUANT_FAILED,N/A,$save_dir" >> "$RESULTS_CSV"
        done
        FAILED=$((FAILED + ${#EVAL_NAMES[@]}))
        TOTAL=$((TOTAL + ${#EVAL_NAMES[@]}))
        print_summary
        print_comparison
        continue  # next actorder_state
    fi
fi
# ── Clear GPU memory before eval ─────────────────────────
# Best-effort: any failure (e.g. no CUDA available) is discarded via 2>/dev/null.
python3 -c "import torch; torch.cuda.empty_cache(); [torch.cuda.reset_peak_memory_stats(i) for i in range(torch.cuda.device_count())]" 2>/dev/null
# ── Evaluate ─────────────────────────────────────────────
for eval_idx in "${!EVAL_NAMES[@]}"; do
    eval_name="${EVAL_NAMES[$eval_idx]}"
    lm_task="${EVAL_LM_TASKS[$eval_idx]}"
    fewshot="${EVAL_FEWSHOT[$eval_idx]}"
    backend="${EVAL_BACKENDS[$eval_idx]}"
    eval_dir="$EVAL_BASE_DIR/${model_name}-${scheme}-${actorder_state}/${eval_name}"
    TOTAL=$((TOTAL + 1))
    # Skip this eval if a previous run already produced results (cache hit);
    # take the newest results file if several exist.
    existing_result=$(find "$eval_dir" -name "results_*.json" -type f 2>/dev/null | sort | tail -1)
    if [ -n "$existing_result" ]; then
        metric_val=$(extract_metric "$eval_dir" "$lm_task")
        echo " EVAL: $eval_name — skipping, previous result found: $metric_val"
        echo "$model_name,$scheme,$actorder_state,$eval_name,$metric_val,PASSED,cached,$save_dir" >> "$RESULTS_CSV"
        PASSED=$((PASSED + 1))
        continue
    fi
    if [ "$backend" == "hf" ]; then
        run_hf_eval "$save_dir" "$lm_task" "$fewshot" "$eval_dir"
    else
        run_vllm_eval "$save_dir" "$lm_task" "$fewshot" "$max_model_len" "$tp_size" "$eval_dir"
    fi
    eval_status=$?
    if [ $eval_status -eq 0 ]; then
        metric_val=$(extract_metric "$eval_dir" "$lm_task")
        # FIX: log the per-eval backend ($backend, selected from EVAL_BACKENDS
        # above) instead of the global $EVAL_BACKEND, which cannot reflect the
        # per-eval hf/vllm choice — looks like a leftover from before the
        # per-eval backend array was introduced; confirm against the CSV readers.
        echo "$model_name,$scheme,$actorder_state,$eval_name,$metric_val,PASSED,$backend,$save_dir" >> "$RESULTS_CSV"
        PASSED=$((PASSED + 1))
    else
        echo "$model_name,$scheme,$actorder_state,$eval_name,N/A,FAILED,$backend,$save_dir" >> "$RESULTS_CSV"
        FAILED=$((FAILED + 1))
    fi
done
# ── Clean up model to free disk space ────────────────────
# Quantized checkpoints can be large; remove the save dir once all evals for
# this combo have run so the next combo has room. (The all-cached path above
# `continue`s before reaching here, so cached combos keep nothing on disk.)
if [ -d "$save_dir" ]; then
    echo "Removing quantized model at $save_dir to free disk space."
    rm -rf "$save_dir"
fi
print_summary
print_comparison
done # actorder_state
done # model
# ── Final Summary ──────────────────────────────────────────────────────────
# Print the closing banner with the aggregate pass/fail tallies, then the
# detailed tables, then pointers to every artifact this run produced.
cat <<EOF

╔══════════════════════════════════════════════════════════════════════════════════════════╗
║ FINAL SUMMARY: $PASSED passed, $FAILED failed out of $TOTAL total evaluations ║
╚══════════════════════════════════════════════════════════════════════════════════════════╝

EOF
print_summary
print_comparison
cat <<EOF
Results CSV: $RESULTS_CSV
Saved models: $MODEL_BASE_DIR/
Eval outputs: $EVAL_BASE_DIR/

To extract detailed metrics from the log:
 python extract_log_summary.py regression_results.log
EOF
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.