AWQ DDP Regression Test Suite - quantization + eval across models, schemes, and pre/post-DDP commits
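The gist contains a log parser (extract_log_summary.py), one AWQ quantization script per model (llama3_awq.py, llama4_scout_awq.py, mixtral_awq.py, qwen25_32b_awq.py, qwen3_vl_awq.py; the driver expects them under examples/awq/regression_tests/), and the driver itself (run_all_tests.sh). The intended workflow, taken verbatim from the driver's header:

./run_all_tests.sh 2>&1 | tee regression_results.log
python extract_log_summary.py regression_results.log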
── extract_log_summary.py ──
#!/usr/bin/env python3
"""Extract summary data from AWQ DDP regression test log files.

Parses the log output from run_all_tests.sh and produces a comparison table
showing pre-DDP vs post-DDP results across models, schemes, and benchmarks.

Usage:
    python extract_log_summary.py regression_results.log
"""
import re
import sys
from collections import defaultdict


def extract_log_summary(log_path):
    with open(log_path, "r") as f:
        content = f.read()

    # Split into sections by the box-drawing delimiters
    section_pattern = re.compile(
        r"║\s+MODEL:\s+(.+?)\n"
        r"\s*║\s+SCHEME:\s+(.+?)\n"
        r"\s*║\s+CODE STATE:\s+(.+?)\n"
        r".*?"
        r"(?=║\s+MODEL:|╔══.*FINAL SUMMARY|\Z)",
        re.DOTALL,
    )
    sections = section_pattern.findall(content)

    # results[model][scheme][code_state] = {task: {metric: value}}
    results = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
    timing = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))

    for model, scheme, code_state in sections:
        model = model.strip()
        scheme = scheme.strip()
        code_state = code_state.strip()

        # Find the body for this section
        pattern = re.escape(f"MODEL: {model}") + r".*?" + re.escape(f"CODE STATE: {code_state}")
        match = re.search(pattern, content, re.DOTALL)
        if not match:
            continue

        # Get everything after the match until the next section
        start = match.end()
        next_section = re.search(r"║\s+MODEL:", content[start:])
        end = start + next_section.start() if next_section else len(content)
        body = content[start:end]

        # Extract timing
        time_match = re.search(
            r"Time:\s*([\d.]+)\s*minutes\s*\(([\d.]+)\s*seconds\)", body
        )
        if time_match:
            timing[model][scheme][code_state]["time_min"] = float(
                time_match.group(1)
            )
        gpu_match = re.search(r"Peak GPU Memory:\s*([\d.]+)\s*GB", body)
        if gpu_match:
            timing[model][scheme][code_state]["gpu_gb"] = float(gpu_match.group(1))

        # Extract GSM8K flexible-extract
        flex_match = re.search(
            r"flexible-extract\|.*?\|.*?\|.*?\|([\d.]+)\|", body
        )
        if flex_match:
            results[model][scheme][code_state]["gsm8k_flex"] = float(
                flex_match.group(1)
            )

        # Extract GSM8K strict-match
        strict_match = re.search(
            r"strict-match\s*\|.*?\|.*?\|.*?\|([\d.]+)\|", body
        )
        if strict_match:
            results[model][scheme][code_state]["gsm8k_strict"] = float(
                strict_match.group(1)
            )

        # Extract wikitext word_perplexity
        ppl_match = re.search(
            r"word_perplexity\s*\|.*?\|.*?\|.*?\|([\d.]+)\|", body
        )
        if ppl_match:
            results[model][scheme][code_state]["wikitext_ppl"] = float(
                ppl_match.group(1)
            )

        # Extract MMLU accuracy
        mmlu_match = re.search(
            r"acc\s*\|.*?\|.*?\|.*?\|([\d.]+)\|", body
        )
        if mmlu_match:
            results[model][scheme][code_state]["mmlu_acc"] = float(
                mmlu_match.group(1)
            )

    # Print comparison table
    print(f"\nLog: {log_path}\n")
    metrics = ["gsm8k_flex", "gsm8k_strict", "wikitext_ppl", "mmlu_acc"]
    metric_labels = ["GSM8K Flex", "GSM8K Strict", "Wiki PPL", "MMLU Acc"]
    header = (
        f"{'Model':<35} {'Scheme':<12} {'State':<10} "
        f"{'Time':>7} {'GPU':>6} "
        + " ".join(f"{m:>12}" for m in metric_labels)
    )
    print(header)
    print("-" * len(header))
    for model in sorted(results.keys()):
        for scheme in sorted(results[model].keys()):
            for code_state in ["pre-ddp", "post-ddp"]:
                r = results[model][scheme].get(code_state, {})
                t = timing[model][scheme].get(code_state, {})
                time_str = (
                    f"{t['time_min']:.1f}m" if "time_min" in t else "N/A"
                )
                gpu_str = (
                    f"{t['gpu_gb']:.1f}G" if "gpu_gb" in t else "N/A"
                )
                vals = []
                for m in metrics:
                    if m in r:
                        vals.append(f"{r[m]:.4f}")
                    else:
                        vals.append("N/A")
                print(
                    f"{model:<35} {scheme:<12} {code_state:<10} "
                    f"{time_str:>7} {gpu_str:>6} "
                    + " ".join(f"{v:>12}" for v in vals)
                )

            # Print delta row
            pre = results[model][scheme].get("pre-ddp", {})
            post = results[model][scheme].get("post-ddp", {})
            deltas = []
            for m in metrics:
                if m in pre and m in post:
                    diff = post[m] - pre[m]
                    sign = "+" if diff >= 0 else ""
                    # For perplexity, lower is better so flip the sign indicator
                    if m == "wikitext_ppl":
                        indicator = " *" if diff > 0 else ""
                    else:
                        indicator = " *" if diff < -0.01 else ""
                    deltas.append(f"{sign}{diff:.4f}{indicator}")
                else:
                    deltas.append("---")
            print(
                f"{'':35} {'':12} {'delta':<10} "
                f"{'':>7} {'':>6} "
                + " ".join(f"{d:>12}" for d in deltas)
            )
    print()


if __name__ == "__main__":
    log_path = sys.argv[1] if len(sys.argv) > 1 else "regression_results.log"
    extract_log_summary(log_path)
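For reference, a hypothetical smoke test of the parser. The model name, timings, and table rows below are invented for illustration (real rows come from lm_eval's markdown tables, which have more columns), but they match the regexes above; this assumes the file is saved as extract_log_summary.py so it can be imported.

# Hypothetical smoke test for the parser; all values below are made up.
from extract_log_summary import extract_log_summary

SAMPLE_LOG = """\
║ MODEL: Meta-Llama-3-8B-Instruct
║ SCHEME: W4A16_ASYM
║ CODE STATE: pre-ddp
Time: 12.50 minutes (750.00 seconds)
Peak GPU Memory: 18.42 GB
|gsm8k|flexible-extract|5|exact_match|↑|0.7506|
|gsm8k|strict-match    |5|exact_match|↑|0.7399|
"""

with open("/tmp/sample_regression.log", "w") as f:
    f.write(SAMPLE_LOG)

# Prints one pre-ddp row (wikitext/MMLU show N/A) and a "---" delta row,
# since the sample has no matching post-ddp section.
extract_log_summary("/tmp/sample_regression.log")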
── llama3_awq.py ──
import argparse
import time

import torch
from compressed_tensors.offload import dispatch_model
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--scheme", default="W4A16_ASYM")
    parser.add_argument("--save-dir", default=None)
    parser.add_argument("--num-samples", type=int, default=NUM_CALIBRATION_SAMPLES)
    args = parser.parse_args()
    num_samples = args.num_samples

    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{num_samples}]")
    ds = ds.shuffle(seed=42)

    def preprocess(example):
        return {
            "text": tokenizer.apply_chat_template(
                example["messages"],
                tokenize=False,
            )
        }

    ds = ds.map(preprocess)

    def tokenize(sample):
        return tokenizer(
            sample["text"],
            padding=False,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
            add_special_tokens=False,
        )

    ds = ds.map(tokenize, remove_columns=ds.column_names)

    recipe = [
        AWQModifier(
            ignore=["lm_head"],
            scheme=args.scheme,
            targets=["Linear"],
            duo_scaling="both",
        ),
    ]

    torch.cuda.reset_peak_memory_stats()
    start_time = time.time()
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=num_samples,
    )
    elapsed_time = time.time() - start_time
    peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3)
    print("Quantization Complete")
    print(f"Time: {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")
    print(f"Peak GPU Memory: {peak_memory_gb:.2f} GB")
    print("\n\n")

    print("========== SAMPLE GENERATION ==============")
    dispatch_model(model)
    input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
        model.device
    )
    output = model.generate(input_ids, max_new_tokens=100)
    print(tokenizer.decode(output[0]))
    print("==========================================\n\n")

    save_dir = args.save_dir or (
        MODEL_ID.rstrip("/").split("/")[-1] + f"-{args.scheme}"
    )
    model.save_pretrained(save_dir, save_compressed=True)
    tokenizer.save_pretrained(save_dir)
    print(f"Model saved to {save_dir}")


if __name__ == "__main__":
    main()
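The saved directory is what run_all_tests.sh later points lm_eval at. As a quick spot check outside the harness, the compressed checkpoint can also be loaded directly in vLLM (vLLM supports compressed-tensors checkpoints); a minimal sketch, where the directory name is the script's default save_dir for the default scheme:

# Hypothetical spot check: load the compressed checkpoint in vLLM.
from vllm import LLM, SamplingParams

llm = LLM(model="Meta-Llama-3-8B-Instruct-W4A16_ASYM", max_model_len=2048)
out = llm.generate(["Hello my name is"], SamplingParams(temperature=0.0, max_tokens=50))
print(out[0].outputs[0].text)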
── llama4_scout_awq.py ──
import argparse
import time

import torch
from datasets import load_dataset
from transformers import Llama4ForConditionalGeneration, Llama4Processor

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.modifiers.awq.mappings import AWQMapping

MODEL_ID = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
DATASET_ID = "neuralmagic/calibration"
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 8192


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--scheme", default="W4A16_ASYM")
    parser.add_argument("--save-dir", default=None)
    parser.add_argument("--num-samples", type=int, default=NUM_CALIBRATION_SAMPLES)
    args = parser.parse_args()
    num_samples = args.num_samples

    model = Llama4ForConditionalGeneration.from_pretrained(MODEL_ID, dtype="auto")
    processor = Llama4Processor.from_pretrained(MODEL_ID)

    ds = load_dataset(DATASET_ID, name="LLM", split=f"train[:{num_samples}]")

    def preprocess_function(example):
        messages = []
        for message in example["messages"]:
            messages.append(
                {
                    "role": message["role"],
                    "content": [{"type": "text", "text": message["content"]}],
                }
            )
        return processor.apply_chat_template(
            messages,
            return_tensors="pt",
            padding=False,
            truncation=True,
            max_length=MAX_SEQUENCE_LENGTH,
            tokenize=True,
            add_special_tokens=False,
            return_dict=True,
            add_generation_prompt=False,
        )

    ds = ds.map(preprocess_function, batched=False, remove_columns=ds.column_names)

    def data_collator(batch):
        assert len(batch) == 1
        return {
            key: (
                torch.tensor(value)
                if key != "pixel_values"
                else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
            )
            for key, value in batch[0].items()
        }

    # Llama-4-Scout has both vision_model and language_model sub-models,
    # so mappings must be scoped to language_model to avoid dual matches.
    # The main experts use a fused gate_up_proj (not Linear), so only
    # shared_expert Linear layers are AWQ targets.
    recipe = AWQModifier(
        targets="Linear",
        scheme=args.scheme,
        ignore=[
            "re:.*lm_head",
            "re:.*self_attn",
            "re:.*router",
            "re:.*vision_model.*",
            "re:.*multi_modal_projector.*",
            "Llama4TextAttention",
        ],
        mappings=[
            AWQMapping(
                "re:.*language_model.*post_attention_layernorm$",
                [
                    "re:.*shared_expert.gate_proj$",
                    "re:.*shared_expert.up_proj$",
                ],
            ),
            AWQMapping(
                "re:.*shared_expert.up_proj$",
                ["re:.*shared_expert.down_proj$"],
            ),
        ],
    )

    torch.cuda.reset_peak_memory_stats()
    start_time = time.time()
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=num_samples,
        data_collator=data_collator,
        sequential_targets=["Llama4TextMLP"],
    )
    elapsed_time = time.time() - start_time
    peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3)
    print("Quantization Complete")
    print(f"Time: {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")
    print(f"Peak GPU Memory: {peak_memory_gb:.2f} GB")

    save_dir = args.save_dir or (
        MODEL_ID.rstrip("/").split("/")[-1] + f"-{args.scheme}"
    )
    model.save_pretrained(save_dir, save_compressed=True)
    processor.save_pretrained(save_dir)
    print(f"Model saved to {save_dir}")


if __name__ == "__main__":
    main()
── mixtral_awq.py ──
import argparse
import time

import torch
from compressed_tensors.offload import dispatch_model
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.modifiers.awq.mappings import AWQMapping

MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--scheme", default="W4A16_ASYM")
    parser.add_argument("--save-dir", default=None)
    parser.add_argument("--num-samples", type=int, default=NUM_CALIBRATION_SAMPLES)
    args = parser.parse_args()
    num_samples = args.num_samples

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID, dtype=torch.bfloat16, trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

    ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{num_samples}]")
    ds = ds.shuffle(seed=42)

    def preprocess(example):
        return {
            "text": tokenizer.apply_chat_template(
                example["messages"],
                tokenize=False,
            )
        }

    ds = ds.map(preprocess)

    def tokenize(sample):
        return tokenizer(
            sample["text"],
            padding=False,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
            add_special_tokens=False,
        )

    ds = ds.map(tokenize, remove_columns=ds.column_names)

    # Mixtral uses w1/w2/w3 naming for expert layers instead of
    # gate_proj/up_proj/down_proj, so we need custom mappings
    recipe = [
        AWQModifier(
            ignore=[
                "lm_head",
                "re:.*block_sparse_moe.gate",
            ],
            scheme=args.scheme,
            targets=["Linear"],
            duo_scaling="both",
            mappings=[
                AWQMapping(
                    "re:.*input_layernorm$",
                    ["re:.*q_proj$", "re:.*k_proj$", "re:.*v_proj$"],
                ),
                AWQMapping("re:.*v_proj$", ["re:.*o_proj$"]),
                AWQMapping(
                    "re:.*post_attention_layernorm$",
                    [
                        "re:.*block_sparse_moe.experts.*.w1$",
                        "re:.*block_sparse_moe.experts.*.w3$",
                    ],
                ),
                AWQMapping("re:.*w3$", ["re:.*w2$"]),
            ],
        ),
    ]

    torch.cuda.reset_peak_memory_stats()
    start_time = time.time()
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=num_samples,
        trust_remote_code_model=True,
    )
    elapsed_time = time.time() - start_time
    peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3)
    print("Quantization Complete")
    print(f"Time: {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")
    print(f"Peak GPU Memory: {peak_memory_gb:.2f} GB")
    print("\n\n")

    print("========== SAMPLE GENERATION ==============")
    dispatch_model(model)
    input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
        model.device
    )
    output = model.generate(input_ids, max_new_tokens=100)
    print(tokenizer.decode(output[0]))
    print("==========================================\n\n")

    save_dir = args.save_dir or (
        MODEL_ID.rstrip("/").split("/")[-1] + f"-{args.scheme}"
    )
    model.save_pretrained(save_dir, save_compressed=True)
    tokenizer.save_pretrained(save_dir)
    print(f"Model saved to {save_dir}")


if __name__ == "__main__":
    main()
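The custom mappings above hinge on Mixtral's w1/w2/w3 expert naming. A small stdlib-only sketch of what those patterns select, run against hypothetical module names that follow Mixtral's layout (real names come from model.named_modules(); llmcompressor's "re:" prefix marks a pattern as a regex, which is stripped here and applied with re.match):

# Illustrative check of the mapping patterns; module names are hypothetical.
import re

names = [
    "model.layers.0.block_sparse_moe.gate",
    "model.layers.0.block_sparse_moe.experts.0.w1",
    "model.layers.0.block_sparse_moe.experts.0.w2",
    "model.layers.0.block_sparse_moe.experts.0.w3",
    "model.layers.0.post_attention_layernorm",
]
patterns = {
    "smooth (post-attn LN)": r".*post_attention_layernorm$",
    "balance (expert w1)": r".*block_sparse_moe.experts.*.w1$",
    "smooth (w3 -> w2)": r".*w3$",
    "ignored router gate": r".*block_sparse_moe.gate",
}
for label, pat in patterns.items():
    matches = [n for n in names if re.match(pat, n)]
    print(f"{label:24s} {pat!r}: {matches}")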
── qwen25_32b_awq.py ──
import argparse
import time

import torch
from compressed_tensors.offload import dispatch_model
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier

MODEL_ID = "Qwen/Qwen2.5-32B-Instruct"
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--scheme", default="W4A16_ASYM")
    parser.add_argument("--save-dir", default=None)
    parser.add_argument("--num-samples", type=int, default=NUM_CALIBRATION_SAMPLES)
    args = parser.parse_args()
    num_samples = args.num_samples

    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{num_samples}]")
    ds = ds.shuffle(seed=42)

    def preprocess(example):
        return {
            "text": tokenizer.apply_chat_template(
                example["messages"],
                tokenize=False,
            )
        }

    ds = ds.map(preprocess)

    def tokenize(sample):
        return tokenizer(
            sample["text"],
            padding=False,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
            add_special_tokens=False,
        )

    ds = ds.map(tokenize, remove_columns=ds.column_names)

    recipe = [
        AWQModifier(
            ignore=["lm_head"],
            scheme=args.scheme,
            targets=["Linear"],
            duo_scaling="both",
        ),
    ]

    torch.cuda.reset_peak_memory_stats()
    start_time = time.time()
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=num_samples,
    )
    elapsed_time = time.time() - start_time
    peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3)
    print("Quantization Complete")
    print(f"Time: {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")
    print(f"Peak GPU Memory: {peak_memory_gb:.2f} GB")
    print("\n\n")

    print("========== SAMPLE GENERATION ==============")
    dispatch_model(model)
    input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
        model.device
    )
    output = model.generate(input_ids, max_new_tokens=100)
    print(tokenizer.decode(output[0]))
    print("==========================================\n\n")

    save_dir = args.save_dir or (
        MODEL_ID.rstrip("/").split("/")[-1] + f"-{args.scheme}"
    )
    model.save_pretrained(save_dir, save_compressed=True)
    tokenizer.save_pretrained(save_dir)
    print(f"Model saved to {save_dir}")


if __name__ == "__main__":
    main()
── qwen3_vl_awq.py ──
import argparse
import base64
import time
from io import BytesIO

import torch
from compressed_tensors.offload import dispatch_model
from datasets import load_dataset
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen3VLForConditionalGeneration

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier

MODEL_ID = "Qwen/Qwen3-VL-8B-Instruct"
DATASET_ID = "lmms-lab/flickr30k"
DATASET_SPLIT = "test"
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--scheme", default="W4A16_ASYM")
    parser.add_argument("--save-dir", default=None)
    parser.add_argument("--num-samples", type=int, default=NUM_CALIBRATION_SAMPLES)
    args = parser.parse_args()
    num_samples = args.num_samples

    model = Qwen3VLForConditionalGeneration.from_pretrained(
        MODEL_ID, torch_dtype="auto"
    )
    processor = AutoProcessor.from_pretrained(MODEL_ID)

    ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{num_samples}]")
    ds = ds.shuffle(seed=42)

    def preprocess_and_tokenize(example):
        # Re-encode the PIL image as a base64 data URI so it can be embedded
        # in the chat message content
        buffered = BytesIO()
        example["image"].save(buffered, format="PNG")
        encoded_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
        base64_qwen = f"data:image;base64,{encoded_image}"
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": base64_qwen},
                    {"type": "text", "text": "What does the image show?"},
                ],
            }
        ]
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        return processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=False,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
        )

    ds = ds.map(preprocess_and_tokenize, remove_columns=ds.column_names)

    def data_collator(batch):
        assert len(batch) == 1
        return {key: torch.tensor(value) for key, value in batch[0].items()}

    recipe = AWQModifier(
        scheme=args.scheme,
        ignore=["re:.*lm_head", "re:.*visual.*"],
        duo_scaling=False,
    )

    torch.cuda.reset_peak_memory_stats()
    start_time = time.time()
    oneshot(
        model=model,
        tokenizer=MODEL_ID,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=num_samples,
        data_collator=data_collator,
        sequential_targets=["Qwen3VLTextDecoderLayer"],
    )
    elapsed_time = time.time() - start_time
    peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3)
    print("Quantization Complete")
    print(f"Time: {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")
    print(f"Peak GPU Memory: {peak_memory_gb:.2f} GB")
    print("\n\n")

    print("========== SAMPLE GENERATION ==============")
    dispatch_model(model)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": "http://images.cocodataset.org/train2017/000000231895.jpg",
                },
                {"type": "text", "text": "Please describe the animal in this image\n"},
            ],
        }
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[prompt],
        images=image_inputs,
        videos=video_inputs,
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        return_tensors="pt",
    ).to(model.device)
    output = model.generate(**inputs, max_new_tokens=100)
    print(processor.decode(output[0], skip_special_tokens=True))
    print("==========================================\n\n")

    save_dir = args.save_dir or (
        MODEL_ID.rstrip("/").split("/")[-1] + f"-{args.scheme}"
    )
    model.save_pretrained(save_dir, save_compressed=True)
    processor.save_pretrained(save_dir)
    print(f"Model saved to {save_dir}")


if __name__ == "__main__":
    main()
── run_all_tests.sh ──
#!/bin/bash
# AWQ DDP Regression Test Suite
# Compares pre-DDP vs post-DDP AWQ quality across models, formats, and benchmarks.
#
# Usage:
#   ./run_all_tests.sh 2>&1 | tee regression_results.log
#   python extract_log_summary.py regression_results.log
#
# Quantized models are removed after each eval pass to free disk space;
# eval outputs and the results CSV are kept for follow-up analysis.
set -o pipefail

# Avoid permission errors on shared HF cache files
export HF_DATASETS_CACHE="/tmp/hf_datasets_cache"
mkdir -p "$HF_DATASETS_CACHE"

# ── Configuration ────────────────────────────────────────────────────────────
PRE_DDP_COMMIT="2ab02443"
POST_DDP_COMMIT="0bc916e5"

# Backup regression test scripts (they don't exist in old commits)
BACKUP_DIR="/tmp/awq_regression_tests_backup"
mkdir -p "$BACKUP_DIR"
cp -r examples/awq/regression_tests/* "$BACKUP_DIR/" 2>/dev/null || true

SCRIPTS=(
    "examples/awq/regression_tests/llama3_awq.py"
    "examples/awq/regression_tests/qwen3_vl_awq.py"
    "examples/awq/regression_tests/llama4_scout_awq.py"
    "examples/awq/regression_tests/qwen25_32b_awq.py"
    "examples/awq/regression_tests/mixtral_awq.py"
)
MODEL_SHORT_NAMES=(
    "Meta-Llama-3-8B-Instruct"
    "Qwen3-VL-8B-Instruct"
    "Llama-4-Scout-17B-16E-Instruct"
    "Qwen2.5-32B-Instruct"
    "Mixtral-8x7B-Instruct-v0.1"
)
# vLLM eval settings per model: max_model_len,tensor_parallel_size,num_gpus_quant
MODEL_VLLM_ARGS=(
    "2048,1,1"
    "4096,1,1"
    "4096,2,1"
    "4096,2,1"
    "2048,2,1"
)
SCHEMES=("W4A16_ASYM" "W8A8" "W8A16")

# eval_name        lm_eval_task     fewshot  backend
# gsm8k            gsm8k            5        vllm
# gsm8k_platinum   gsm8k_platinum   5        vllm
# wikitext         wikitext         0        vllm
# mmlu             mmlu             5        vllm
EVAL_NAMES=("gsm8k" "gsm8k_platinum" "wikitext" "mmlu")
EVAL_LM_TASKS=("gsm8k" "gsm8k_platinum" "wikitext" "mmlu")
EVAL_FEWSHOT=("5" "5" "0" "5")
EVAL_BACKENDS=("vllm" "vllm" "vllm" "vllm")

CODE_STATES=("pre-ddp" "post-ddp")
EVAL_BASE_DIR="./eval_results"
MODEL_BASE_DIR="./regression_models"
RESULTS_CSV="regression_results.csv"
mkdir -p "$EVAL_BASE_DIR" "$MODEL_BASE_DIR"

# ── Helper: activate environments ────────────────────────────────────────────
activate_quant_env() {
    source /home/HDCharles/rhdev/bin/activate
}
activate_eval_env() {
    source /home/HDCharles/vllm/bin/activate
}

# ── Helper: run vLLM evaluation with fallback chain ──────────────────────────
EVAL_BACKEND=""  # set by run_vllm_eval to indicate which backend succeeded
run_vllm_eval() {
    local save_dir=$1
    local task=$2
    local num_fewshot=$3
    local max_model_len=$4
    local tp_size=$5
    local eval_output_dir=$6
    mkdir -p "$eval_output_dir"
    EVAL_BACKEND="FAILED"
    activate_eval_env
    echo " EVAL: $task (fewshot=$num_fewshot, tp=$tp_size, max_len=$max_model_len)"

    # Build common eval flags
    local chat_args="--apply_chat_template"
    if [ "$num_fewshot" -gt 0 ]; then
        chat_args="$chat_args --fewshot_as_multiturn"
    fi

    # Try with tensor_parallel
    if [ "$tp_size" -gt 1 ]; then
        lm_eval \
            --model vllm \
            --model_args "pretrained=$save_dir,dtype=auto,max_model_len=$max_model_len,add_bos_token=True,tensor_parallel_size=$tp_size,gpu_memory_utilization=0.85" \
            --tasks "$task" \
            --num_fewshot "$num_fewshot" \
            --batch_size auto \
            $chat_args \
            --output_path "$eval_output_dir" 2>&1
        if [ $? -eq 0 ]; then EVAL_BACKEND="vllm_tp${tp_size}"; return 0; fi

        echo " TP=$tp_size failed, trying expert_parallel..."
        lm_eval \
            --model vllm \
            --model_args "pretrained=$save_dir,dtype=auto,max_model_len=$max_model_len,add_bos_token=True,enable_expert_parallel=True,gpu_memory_utilization=0.85" \
            --tasks "$task" \
            --num_fewshot "$num_fewshot" \
            --batch_size auto \
            $chat_args \
            --output_path "$eval_output_dir" 2>&1
        if [ $? -eq 0 ]; then EVAL_BACKEND="vllm_expert_parallel"; return 0; fi
    fi

    echo " Trying TP=1..."
    lm_eval \
        --model vllm \
        --model_args "pretrained=$save_dir,dtype=auto,max_model_len=$max_model_len,add_bos_token=True,gpu_memory_utilization=0.85" \
        --tasks "$task" \
        --num_fewshot "$num_fewshot" \
        --batch_size auto \
        $chat_args \
        --output_path "$eval_output_dir" 2>&1
    if [ $? -eq 0 ]; then EVAL_BACKEND="vllm_tp1"; return 0; fi

    echo " Trying enforce_eager..."
    lm_eval \
        --model vllm \
        --model_args "pretrained=$save_dir,dtype=auto,max_model_len=$max_model_len,add_bos_token=True,enforce_eager=True,gpu_memory_utilization=0.85" \
        --tasks "$task" \
        --num_fewshot "$num_fewshot" \
        --batch_size auto \
        $chat_args \
        --output_path "$eval_output_dir" 2>&1
    if [ $? -eq 0 ]; then EVAL_BACKEND="vllm_eager"; return 0; fi

    echo " Trying hf backend as last resort..."
    lm_eval \
        --model hf \
        --model_args "pretrained=$save_dir,dtype=auto,add_bos_token=True" \
        --tasks "$task" \
        --num_fewshot "$num_fewshot" \
        --batch_size auto \
        $chat_args \
        --output_path "$eval_output_dir" 2>&1
    if [ $? -eq 0 ]; then EVAL_BACKEND="hf"; return 0; fi

    EVAL_BACKEND="FAILED"
    return 1
}

# ── Helper: run HF-only evaluation ───────────────────────────────────────────
run_hf_eval() {
    local save_dir=$1
    local task=$2
    local num_fewshot=$3
    local eval_output_dir=$4
    mkdir -p "$eval_output_dir"
    EVAL_BACKEND="FAILED"
    activate_eval_env
    echo " EVAL: $task (fewshot=$num_fewshot, backend=hf)"

    # Build common eval flags
    local chat_args="--apply_chat_template"
    if [ "$num_fewshot" -gt 0 ]; then
        chat_args="$chat_args --fewshot_as_multiturn"
    fi

    lm_eval \
        --model hf \
        --model_args "pretrained=$save_dir,dtype=auto,add_bos_token=True" \
        --tasks "$task" \
        --num_fewshot "$num_fewshot" \
        --batch_size auto \
        $chat_args \
        --output_path "$eval_output_dir" 2>&1
    if [ $? -eq 0 ]; then EVAL_BACKEND="hf"; return 0; fi

    EVAL_BACKEND="FAILED"
    return 1
}

# ── Helper: extract metric from lm_eval JSON results ─────────────────────────
extract_metric() {
    local eval_output_dir=$1
    local task=$2
    # Find the most recent results JSON in the eval output dir
    local results_json
    results_json=$(find "$eval_output_dir" -name "results_*.json" -type f 2>/dev/null | sort | tail -1)
    if [ -z "$results_json" ]; then
        echo "N/A"
        return
    fi
    python3 -c "
import json, sys
with open('$results_json') as f:
    data = json.load(f)
results = data.get('results', {})
task = '$task'
# Handle task name variations (e.g., wikitext vs wikitext2)
task_results = None
for key in results:
    if task in key:
        task_results = results[key]
        break
if task_results is None:
    print('N/A')
    sys.exit()
# Extract the primary metric for each task
if 'gsm8k' in task:
    val = task_results.get('exact_match,flexible-extract')
    if val is not None:
        print(f'{val*100:.2f}%')
    else:
        print('N/A')
elif 'wikitext' in task:
    val = task_results.get('word_perplexity,none')
    if val is not None:
        print(f'{val:.2f}')
    else:
        print('N/A')
elif 'mmlu' in task:
    val = task_results.get('acc,none')
    if val is not None:
        print(f'{val*100:.2f}%')
    else:
        print('N/A')
else:
    # Generic: grab first non-stderr, non-alias metric
    for k, v in task_results.items():
        if 'stderr' not in k and k != 'alias' and isinstance(v, (int, float)):
            print(f'{v:.4f}')
            sys.exit()
    print('N/A')
" 2>/dev/null || echo "N/A"
}

# ── Helper: switch code state ────────────────────────────────────────────────
switch_code_state() {
    local state=$1
    activate_quant_env
    if [ "$state" == "pre-ddp" ]; then
        echo "Switching to pre-DDP code ($PRE_DDP_COMMIT)..."
        git checkout "$PRE_DDP_COMMIT" 2>&1
    elif [ "$state" == "post-ddp" ]; then
        echo "Switching to post-DDP code ($POST_DDP_COMMIT)..."
        git checkout "$POST_DDP_COMMIT" 2>&1
    fi
    # Restore test scripts as untracked files
    mkdir -p examples/awq/regression_tests
    cp -r "$BACKUP_DIR"/* examples/awq/regression_tests/ 2>/dev/null || true
    pip install -e . --quiet 2>&1
}

# ── Helper: print current results summary ────────────────────────────────────
print_summary() {
    echo ""
    echo "╔══════════════════════════════════════════════════════════════════════════════════════════════════════╗"
    echo "║ RESULTS SUMMARY (so far) ║"
    echo "╚══════════════════════════════════════════════════════════════════════════════════════════════════════╝"
    echo ""
    if [ -f "$RESULTS_CSV" ]; then
        # Print header + all rows as a formatted table
        column -t -s',' < "$RESULTS_CSV"
    else
        echo "(no results yet)"
    fi
    echo ""
    echo "════════════════════════════════════════════════════════════════════════════════════════════════════════"
    echo ""
}

# ── Initialize results CSV (back up previous results, then start fresh) ──────
if [ -f "$RESULTS_CSV" ]; then
    cp "$RESULTS_CSV" "${RESULTS_CSV}.bak"
fi
echo "model,scheme,code_state,task,metric,status,eval_backend,save_dir" > "$RESULTS_CSV"

# ── Main loop ─────────────────────────────────────────────────────────────────
TOTAL=0
PASSED=0
FAILED=0
for model_idx in "${!SCRIPTS[@]}"; do
    script="${SCRIPTS[$model_idx]}"
    model_name="${MODEL_SHORT_NAMES[$model_idx]}"
    IFS=',' read -r max_model_len tp_size num_gpus_quant <<< "${MODEL_VLLM_ARGS[$model_idx]}"

    for scheme in "${SCHEMES[@]}"; do
        for code_state in "${CODE_STATES[@]}"; do
            save_dir="$MODEL_BASE_DIR/${model_name}-${scheme}-${code_state}"
            echo ""
            echo "╔══════════════════════════════════════════════════════════════════════════════════════════╗"
            echo "║ MODEL: $model_name"
            echo "║ SCHEME: $scheme"
            echo "║ CODE STATE: $code_state"
            echo "╚══════════════════════════════════════════════════════════════════════════════════════════╝"
            echo ""

            # ── Skip entirely if all evals already have results ──────────────
            all_evals_cached=true
            for eval_idx in "${!EVAL_NAMES[@]}"; do
                eval_dir="$EVAL_BASE_DIR/${model_name}-${scheme}-${code_state}/${EVAL_NAMES[$eval_idx]}"
                if ! find "$eval_dir" -name "results_*.json" -type f 2>/dev/null | grep -q .; then
                    all_evals_cached=false
                    break
                fi
            done
            if [ "$all_evals_cached" = true ]; then
                echo "All evals already cached, skipping quantization and eval."
                for eval_idx in "${!EVAL_NAMES[@]}"; do
                    eval_name="${EVAL_NAMES[$eval_idx]}"
                    lm_task="${EVAL_LM_TASKS[$eval_idx]}"
                    eval_dir="$EVAL_BASE_DIR/${model_name}-${scheme}-${code_state}/${eval_name}"
                    metric_val=$(extract_metric "$eval_dir" "$lm_task")
                    echo " $eval_name: $metric_val"
                    echo "$model_name,$scheme,$code_state,$eval_name,$metric_val,PASSED,cached,$save_dir" >> "$RESULTS_CSV"
                    PASSED=$((PASSED + 1))
                    TOTAL=$((TOTAL + 1))
                done
                print_summary
                continue
            fi

            # ── Quantize (skip if model already exists) ──────────────────────
            if [ -d "$save_dir" ] && [ -f "$save_dir/config.json" ]; then
                echo "Quantized model already exists at $save_dir, skipping quantization."
            else
                # Switch code state
                switch_code_state "$code_state"
                echo "============================================"
                echo "Running: $script --scheme $scheme"
                echo "============================================"
                if [ "$num_gpus_quant" -gt 1 ]; then
                    torchrun --nproc_per_node="$num_gpus_quant" "$script" \
                        --scheme "$scheme" --save-dir "$save_dir" 2>&1
                else
                    python "$script" --scheme "$scheme" --save-dir "$save_dir" 2>&1
                fi
                quant_status=$?
                if [ $quant_status -ne 0 ]; then
                    echo "QUANTIZATION FAILED for $model_name / $scheme / $code_state"
                    for eval_name in "${EVAL_NAMES[@]}"; do
                        echo "$model_name,$scheme,$code_state,$eval_name,N/A,QUANT_FAILED,N/A,$save_dir" >> "$RESULTS_CSV"
                    done
                    FAILED=$((FAILED + ${#EVAL_NAMES[@]}))
                    TOTAL=$((TOTAL + ${#EVAL_NAMES[@]}))
                    print_summary
                    continue
                fi
            fi

            # ── Clear GPU memory before eval ─────────────────────────────────
            python3 -c "import torch; torch.cuda.empty_cache(); [torch.cuda.reset_peak_memory_stats(i) for i in range(torch.cuda.device_count())]" 2>/dev/null

            # ── Evaluate ─────────────────────────────────────────────────────
            for eval_idx in "${!EVAL_NAMES[@]}"; do
                eval_name="${EVAL_NAMES[$eval_idx]}"
                lm_task="${EVAL_LM_TASKS[$eval_idx]}"
                fewshot="${EVAL_FEWSHOT[$eval_idx]}"
                backend="${EVAL_BACKENDS[$eval_idx]}"
                eval_dir="$EVAL_BASE_DIR/${model_name}-${scheme}-${code_state}/${eval_name}"
                TOTAL=$((TOTAL + 1))

                # Skip eval if results already exist
                existing_result=$(find "$eval_dir" -name "results_*.json" -type f 2>/dev/null | sort | tail -1)
                if [ -n "$existing_result" ]; then
                    metric_val=$(extract_metric "$eval_dir" "$lm_task")
                    echo " EVAL: $eval_name — skipping, previous result found: $metric_val"
                    echo "$model_name,$scheme,$code_state,$eval_name,$metric_val,PASSED,cached,$save_dir" >> "$RESULTS_CSV"
                    PASSED=$((PASSED + 1))
                    continue
                fi

                if [ "$backend" == "hf" ]; then
                    run_hf_eval "$save_dir" "$lm_task" "$fewshot" "$eval_dir"
                else
                    run_vllm_eval "$save_dir" "$lm_task" "$fewshot" "$max_model_len" "$tp_size" "$eval_dir"
                fi
                eval_status=$?
                if [ $eval_status -eq 0 ]; then
                    metric_val=$(extract_metric "$eval_dir" "$lm_task")
                    echo "$model_name,$scheme,$code_state,$eval_name,$metric_val,PASSED,$EVAL_BACKEND,$save_dir" >> "$RESULTS_CSV"
                    PASSED=$((PASSED + 1))
                else
                    echo "$model_name,$scheme,$code_state,$eval_name,N/A,FAILED,$EVAL_BACKEND,$save_dir" >> "$RESULTS_CSV"
                    FAILED=$((FAILED + 1))
                fi
            done

            # ── Clean up model to free disk space ────────────────────────────
            if [ -d "$save_dir" ]; then
                echo "Removing quantized model at $save_dir to free disk space."
                rm -rf "$save_dir"
            fi

            print_summary
        done # code_state
    done # scheme
done # model
# ── Final Summary ─────────────────────────────────────────────────────────────
echo ""
echo "╔══════════════════════════════════════════════════════════════════════════════════════════╗"
echo "║ FINAL SUMMARY: $PASSED passed, $FAILED failed out of $TOTAL total evaluations ║"
echo "╚══════════════════════════════════════════════════════════════════════════════════════════╝"
echo ""
print_summary
echo "Results CSV: $RESULTS_CSV"
echo "Eval outputs: $EVAL_BASE_DIR/"
echo "Model dir: $MODEL_BASE_DIR/ (quantized models are removed after eval to free disk space)"
echo ""
echo "To extract detailed metrics from the log:"
echo " python extract_log_summary.py regression_results.log"
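Besides column -t and the log parser, the CSV reshapes cleanly into a side-by-side pre/post view. A minimal pandas sketch (assuming pandas is available; the metric cells are the preformatted strings the script writes, e.g. 75.06% or N/A, so this only reshapes and does not aggregate numerically):

# Sketch: pivot regression_results.csv into pre-ddp vs post-ddp columns.
import pandas as pd

df = pd.read_csv("regression_results.csv")
pivot = df.pivot_table(
    index=["model", "scheme", "task"],
    columns="code_state",
    values="metric",
    aggfunc="first",  # cells are strings; just take the recorded value
)
print(pivot.to_string())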