AWQ DDP Regression Test Suite - quantization + eval across models, schemes, and pre/post-DDP commits
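The gist contains a log parser (extract_log_summary.py), one AWQ quantization script per model (llama3_awq.py, llama4_scout_awq.py, mixtral_awq.py, qwen25_32b_awq.py, qwen3_vl_awq.py; the driver expects them under examples/awq/regression_tests/), and the driver itself (run_all_tests.sh). The intended workflow, taken verbatim from the driver's header:

./run_all_tests.sh 2>&1 | tee regression_results.log
python extract_log_summary.py regression_results.log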
── extract_log_summary.py ──
#!/usr/bin/env python3
"""Extract summary data from AWQ DDP regression test log files.

Parses the log output from run_all_tests.sh and produces a comparison table
showing pre-DDP vs post-DDP results across models, schemes, and benchmarks.

Usage:
    python extract_log_summary.py regression_results.log
"""
import re
import sys
from collections import defaultdict


def extract_log_summary(log_path):
    with open(log_path, "r") as f:
        content = f.read()

    # Split into sections by the box-drawing delimiters
    section_pattern = re.compile(
        r"║\s+MODEL:\s+(.+?)\n"
        r"\s*║\s+SCHEME:\s+(.+?)\n"
        r"\s*║\s+CODE STATE:\s+(.+?)\n"
        r".*?"
        r"(?=║\s+MODEL:|╔══.*FINAL SUMMARY|\Z)",
        re.DOTALL,
    )
    sections = section_pattern.findall(content)

    # results[model][scheme][code_state] = {task: {metric: value}}
    results = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
    timing = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))

    for model, scheme, code_state in sections:
        model = model.strip()
        scheme = scheme.strip()
        code_state = code_state.strip()

        # Find the body for this section
        pattern = re.escape(f"MODEL: {model}") + r".*?" + re.escape(f"CODE STATE: {code_state}")
        match = re.search(pattern, content, re.DOTALL)
        if not match:
            continue

        # Get everything after the match until the next section
        start = match.end()
        next_section = re.search(r"║\s+MODEL:", content[start:])
        end = start + next_section.start() if next_section else len(content)
        body = content[start:end]

        # Extract timing
        time_match = re.search(
            r"Time:\s*([\d.]+)\s*minutes\s*\(([\d.]+)\s*seconds\)", body
        )
        if time_match:
            timing[model][scheme][code_state]["time_min"] = float(
                time_match.group(1)
            )
        gpu_match = re.search(r"Peak GPU Memory:\s*([\d.]+)\s*GB", body)
        if gpu_match:
            timing[model][scheme][code_state]["gpu_gb"] = float(gpu_match.group(1))

        # Extract GSM8K flexible-extract
        flex_match = re.search(
            r"flexible-extract\|.*?\|.*?\|.*?\|([\d.]+)\|", body
        )
        if flex_match:
            results[model][scheme][code_state]["gsm8k_flex"] = float(
                flex_match.group(1)
            )

        # Extract GSM8K strict-match
        strict_match = re.search(
            r"strict-match\s*\|.*?\|.*?\|.*?\|([\d.]+)\|", body
        )
        if strict_match:
            results[model][scheme][code_state]["gsm8k_strict"] = float(
                strict_match.group(1)
            )

        # Extract wikitext word_perplexity
        ppl_match = re.search(
            r"word_perplexity\s*\|.*?\|.*?\|.*?\|([\d.]+)\|", body
        )
        if ppl_match:
            results[model][scheme][code_state]["wikitext_ppl"] = float(
                ppl_match.group(1)
            )

        # Extract MMLU accuracy
        mmlu_match = re.search(
            r"acc\s*\|.*?\|.*?\|.*?\|([\d.]+)\|", body
        )
        if mmlu_match:
            results[model][scheme][code_state]["mmlu_acc"] = float(
                mmlu_match.group(1)
            )

    # Print comparison table
    print(f"\nLog: {log_path}\n")
    metrics = ["gsm8k_flex", "gsm8k_strict", "wikitext_ppl", "mmlu_acc"]
    metric_labels = ["GSM8K Flex", "GSM8K Strict", "Wiki PPL", "MMLU Acc"]
    header = (
        f"{'Model':<35} {'Scheme':<12} {'State':<10} "
        f"{'Time':>7} {'GPU':>6} "
        + " ".join(f"{m:>12}" for m in metric_labels)
    )
    print(header)
    print("-" * len(header))
    for model in sorted(results.keys()):
        for scheme in sorted(results[model].keys()):
            for code_state in ["pre-ddp", "post-ddp"]:
                r = results[model][scheme].get(code_state, {})
                t = timing[model][scheme].get(code_state, {})
                time_str = (
                    f"{t['time_min']:.1f}m" if "time_min" in t else "N/A"
                )
                gpu_str = (
                    f"{t['gpu_gb']:.1f}G" if "gpu_gb" in t else "N/A"
                )
                vals = []
                for m in metrics:
                    if m in r:
                        vals.append(f"{r[m]:.4f}")
                    else:
                        vals.append("N/A")
                print(
                    f"{model:<35} {scheme:<12} {code_state:<10} "
                    f"{time_str:>7} {gpu_str:>6} "
                    + " ".join(f"{v:>12}" for v in vals)
                )

            # Print delta row
            pre = results[model][scheme].get("pre-ddp", {})
            post = results[model][scheme].get("post-ddp", {})
            deltas = []
            for m in metrics:
                if m in pre and m in post:
                    diff = post[m] - pre[m]
                    sign = "+" if diff >= 0 else ""
                    # For perplexity, lower is better so flip the sign indicator
                    if m == "wikitext_ppl":
                        indicator = " *" if diff > 0 else ""
                    else:
                        indicator = " *" if diff < -0.01 else ""
                    deltas.append(f"{sign}{diff:.4f}{indicator}")
                else:
                    deltas.append("---")
            print(
                f"{'':35} {'':12} {'delta':<10} "
                f"{'':>7} {'':>6} "
                + " ".join(f"{d:>12}" for d in deltas)
            )
    print()


if __name__ == "__main__":
    log_path = sys.argv[1] if len(sys.argv) > 1 else "regression_results.log"
    extract_log_summary(log_path)
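For reference, a hypothetical smoke test of the parser. The model name, timings, and table rows below are invented for illustration (real rows come from lm_eval's markdown tables, which have more columns), but they match the regexes above; this assumes the file is saved as extract_log_summary.py so it can be imported.

# Hypothetical smoke test for the parser; all values below are made up.
from extract_log_summary import extract_log_summary

SAMPLE_LOG = """\
║ MODEL: Meta-Llama-3-8B-Instruct
║ SCHEME: W4A16_ASYM
║ CODE STATE: pre-ddp
Time: 12.50 minutes (750.00 seconds)
Peak GPU Memory: 18.42 GB
|gsm8k|flexible-extract|5|exact_match|↑|0.7506|
|gsm8k|strict-match    |5|exact_match|↑|0.7399|
"""

with open("/tmp/sample_regression.log", "w") as f:
    f.write(SAMPLE_LOG)

# Prints one pre-ddp row (wikitext/MMLU show N/A) and a "---" delta row,
# since the sample has no matching post-ddp section.
extract_log_summary("/tmp/sample_regression.log")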
── llama3_awq.py ──
import argparse
import time

import torch
from compressed_tensors.offload import dispatch_model
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--scheme", default="W4A16_ASYM")
    parser.add_argument("--save-dir", default=None)
    parser.add_argument("--num-samples", type=int, default=NUM_CALIBRATION_SAMPLES)
    args = parser.parse_args()
    num_samples = args.num_samples

    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{num_samples}]")
    ds = ds.shuffle(seed=42)

    def preprocess(example):
        return {
            "text": tokenizer.apply_chat_template(
                example["messages"],
                tokenize=False,
            )
        }

    ds = ds.map(preprocess)

    def tokenize(sample):
        return tokenizer(
            sample["text"],
            padding=False,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
            add_special_tokens=False,
        )

    ds = ds.map(tokenize, remove_columns=ds.column_names)

    recipe = [
        AWQModifier(
            ignore=["lm_head"],
            scheme=args.scheme,
            targets=["Linear"],
            duo_scaling="both",
        ),
    ]

    torch.cuda.reset_peak_memory_stats()
    start_time = time.time()
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=num_samples,
    )
    elapsed_time = time.time() - start_time
    peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3)
    print("Quantization Complete")
    print(f"Time: {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")
    print(f"Peak GPU Memory: {peak_memory_gb:.2f} GB")
    print("\n\n")

    print("========== SAMPLE GENERATION ==============")
    dispatch_model(model)
    input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
        model.device
    )
    output = model.generate(input_ids, max_new_tokens=100)
    print(tokenizer.decode(output[0]))
    print("==========================================\n\n")

    save_dir = args.save_dir or (
        MODEL_ID.rstrip("/").split("/")[-1] + f"-{args.scheme}"
    )
    model.save_pretrained(save_dir, save_compressed=True)
    tokenizer.save_pretrained(save_dir)
    print(f"Model saved to {save_dir}")


if __name__ == "__main__":
    main()
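The saved directory is what run_all_tests.sh later points lm_eval at. As a quick spot check outside the harness, the compressed checkpoint can also be loaded directly in vLLM (vLLM supports compressed-tensors checkpoints); a minimal sketch, where the directory name is the script's default save_dir for the default scheme:

# Hypothetical spot check: load the compressed checkpoint in vLLM.
from vllm import LLM, SamplingParams

llm = LLM(model="Meta-Llama-3-8B-Instruct-W4A16_ASYM", max_model_len=2048)
out = llm.generate(["Hello my name is"], SamplingParams(temperature=0.0, max_tokens=50))
print(out[0].outputs[0].text)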
── llama4_scout_awq.py ──
import argparse
import time

import torch
from datasets import load_dataset
from transformers import Llama4ForConditionalGeneration, Llama4Processor

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.modifiers.awq.mappings import AWQMapping

MODEL_ID = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
DATASET_ID = "neuralmagic/calibration"
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 8192


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--scheme", default="W4A16_ASYM")
    parser.add_argument("--save-dir", default=None)
    parser.add_argument("--num-samples", type=int, default=NUM_CALIBRATION_SAMPLES)
    args = parser.parse_args()
    num_samples = args.num_samples

    model = Llama4ForConditionalGeneration.from_pretrained(MODEL_ID, dtype="auto")
    processor = Llama4Processor.from_pretrained(MODEL_ID)

    ds = load_dataset(DATASET_ID, name="LLM", split=f"train[:{num_samples}]")

    def preprocess_function(example):
        messages = []
        for message in example["messages"]:
            messages.append(
                {
                    "role": message["role"],
                    "content": [{"type": "text", "text": message["content"]}],
                }
            )
        return processor.apply_chat_template(
            messages,
            return_tensors="pt",
            padding=False,
            truncation=True,
            max_length=MAX_SEQUENCE_LENGTH,
            tokenize=True,
            add_special_tokens=False,
            return_dict=True,
            add_generation_prompt=False,
        )

    ds = ds.map(preprocess_function, batched=False, remove_columns=ds.column_names)

    def data_collator(batch):
        assert len(batch) == 1
        return {
            key: (
                torch.tensor(value)
                if key != "pixel_values"
                else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
            )
            for key, value in batch[0].items()
        }

    # Llama-4-Scout has both vision_model and language_model sub-models,
    # so mappings must be scoped to language_model to avoid dual matches.
    # The main experts use a fused gate_up_proj (not Linear), so only
    # shared_expert Linear layers are AWQ targets.
    recipe = AWQModifier(
        targets="Linear",
        scheme=args.scheme,
        ignore=[
            "re:.*lm_head",
            "re:.*self_attn",
            "re:.*router",
            "re:.*vision_model.*",
            "re:.*multi_modal_projector.*",
            "Llama4TextAttention",
        ],
        mappings=[
            AWQMapping(
                "re:.*language_model.*post_attention_layernorm$",
                [
                    "re:.*shared_expert.gate_proj$",
                    "re:.*shared_expert.up_proj$",
                ],
            ),
            AWQMapping(
                "re:.*shared_expert.up_proj$",
                ["re:.*shared_expert.down_proj$"],
            ),
        ],
    )

    torch.cuda.reset_peak_memory_stats()
    start_time = time.time()
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=num_samples,
        data_collator=data_collator,
        sequential_targets=["Llama4TextMLP"],
    )
    elapsed_time = time.time() - start_time
    peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3)
    print("Quantization Complete")
    print(f"Time: {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")
    print(f"Peak GPU Memory: {peak_memory_gb:.2f} GB")

    save_dir = args.save_dir or (
        MODEL_ID.rstrip("/").split("/")[-1] + f"-{args.scheme}"
    )
    model.save_pretrained(save_dir, save_compressed=True)
    processor.save_pretrained(save_dir)
    print(f"Model saved to {save_dir}")


if __name__ == "__main__":
    main()
── mixtral_awq.py ──
import argparse
import time

import torch
from compressed_tensors.offload import dispatch_model
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.modifiers.awq.mappings import AWQMapping

MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--scheme", default="W4A16_ASYM")
    parser.add_argument("--save-dir", default=None)
    parser.add_argument("--num-samples", type=int, default=NUM_CALIBRATION_SAMPLES)
    args = parser.parse_args()
    num_samples = args.num_samples

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID, dtype=torch.bfloat16, trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

    ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{num_samples}]")
    ds = ds.shuffle(seed=42)

    def preprocess(example):
        return {
            "text": tokenizer.apply_chat_template(
                example["messages"],
                tokenize=False,
            )
        }

    ds = ds.map(preprocess)

    def tokenize(sample):
        return tokenizer(
            sample["text"],
            padding=False,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
            add_special_tokens=False,
        )

    ds = ds.map(tokenize, remove_columns=ds.column_names)

    # Mixtral uses w1/w2/w3 naming for expert layers instead of
    # gate_proj/up_proj/down_proj, so we need custom mappings
    recipe = [
        AWQModifier(
            ignore=[
                "lm_head",
                "re:.*block_sparse_moe.gate",
            ],
            scheme=args.scheme,
            targets=["Linear"],
            duo_scaling="both",
            mappings=[
                AWQMapping(
                    "re:.*input_layernorm$",
                    ["re:.*q_proj$", "re:.*k_proj$", "re:.*v_proj$"],
                ),
                AWQMapping("re:.*v_proj$", ["re:.*o_proj$"]),
                AWQMapping(
                    "re:.*post_attention_layernorm$",
                    [
                        "re:.*block_sparse_moe.experts.*.w1$",
                        "re:.*block_sparse_moe.experts.*.w3$",
                    ],
                ),
                AWQMapping("re:.*w3$", ["re:.*w2$"]),
            ],
        ),
    ]

    torch.cuda.reset_peak_memory_stats()
    start_time = time.time()
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=num_samples,
        trust_remote_code_model=True,
    )
    elapsed_time = time.time() - start_time
    peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3)
    print("Quantization Complete")
    print(f"Time: {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")
    print(f"Peak GPU Memory: {peak_memory_gb:.2f} GB")
    print("\n\n")

    print("========== SAMPLE GENERATION ==============")
    dispatch_model(model)
    input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
        model.device
    )
    output = model.generate(input_ids, max_new_tokens=100)
    print(tokenizer.decode(output[0]))
    print("==========================================\n\n")

    save_dir = args.save_dir or (
        MODEL_ID.rstrip("/").split("/")[-1] + f"-{args.scheme}"
    )
    model.save_pretrained(save_dir, save_compressed=True)
    tokenizer.save_pretrained(save_dir)
    print(f"Model saved to {save_dir}")


if __name__ == "__main__":
    main()
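The custom mappings above hinge on Mixtral's w1/w2/w3 expert naming. A small stdlib-only sketch of what those patterns select, run against hypothetical module names that follow Mixtral's layout (real names come from model.named_modules(); llmcompressor's "re:" prefix marks a pattern as a regex, which is stripped here and applied with re.match):

# Illustrative check of the mapping patterns; module names are hypothetical.
import re

names = [
    "model.layers.0.block_sparse_moe.gate",
    "model.layers.0.block_sparse_moe.experts.0.w1",
    "model.layers.0.block_sparse_moe.experts.0.w2",
    "model.layers.0.block_sparse_moe.experts.0.w3",
    "model.layers.0.post_attention_layernorm",
]
patterns = {
    "smooth (post-attn LN)": r".*post_attention_layernorm$",
    "balance (expert w1)": r".*block_sparse_moe.experts.*.w1$",
    "smooth (w3 -> w2)": r".*w3$",
    "ignored router gate": r".*block_sparse_moe.gate",
}
for label, pat in patterns.items():
    matches = [n for n in names if re.match(pat, n)]
    print(f"{label:24s} {pat!r}: {matches}")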
── qwen25_32b_awq.py ──
import argparse
import time

import torch
from compressed_tensors.offload import dispatch_model
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier

MODEL_ID = "Qwen/Qwen2.5-32B-Instruct"
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--scheme", default="W4A16_ASYM")
    parser.add_argument("--save-dir", default=None)
    parser.add_argument("--num-samples", type=int, default=NUM_CALIBRATION_SAMPLES)
    args = parser.parse_args()
    num_samples = args.num_samples

    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{num_samples}]")
    ds = ds.shuffle(seed=42)

    def preprocess(example):
        return {
            "text": tokenizer.apply_chat_template(
                example["messages"],
                tokenize=False,
            )
        }

    ds = ds.map(preprocess)

    def tokenize(sample):
        return tokenizer(
            sample["text"],
            padding=False,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
            add_special_tokens=False,
        )

    ds = ds.map(tokenize, remove_columns=ds.column_names)

    recipe = [
        AWQModifier(
            ignore=["lm_head"],
            scheme=args.scheme,
            targets=["Linear"],
            duo_scaling="both",
        ),
    ]

    torch.cuda.reset_peak_memory_stats()
    start_time = time.time()
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=num_samples,
    )
    elapsed_time = time.time() - start_time
    peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3)
    print("Quantization Complete")
    print(f"Time: {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")
    print(f"Peak GPU Memory: {peak_memory_gb:.2f} GB")
    print("\n\n")

    print("========== SAMPLE GENERATION ==============")
    dispatch_model(model)
    input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
        model.device
    )
    output = model.generate(input_ids, max_new_tokens=100)
    print(tokenizer.decode(output[0]))
    print("==========================================\n\n")

    save_dir = args.save_dir or (
        MODEL_ID.rstrip("/").split("/")[-1] + f"-{args.scheme}"
    )
    model.save_pretrained(save_dir, save_compressed=True)
    tokenizer.save_pretrained(save_dir)
    print(f"Model saved to {save_dir}")


if __name__ == "__main__":
    main()
── qwen3_vl_awq.py ──
import argparse
import base64
import time
from io import BytesIO

import torch
from compressed_tensors.offload import dispatch_model
from datasets import load_dataset
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen3VLForConditionalGeneration

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier

MODEL_ID = "Qwen/Qwen3-VL-8B-Instruct"
DATASET_ID = "lmms-lab/flickr30k"
DATASET_SPLIT = "test"
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--scheme", default="W4A16_ASYM")
    parser.add_argument("--save-dir", default=None)
    parser.add_argument("--num-samples", type=int, default=NUM_CALIBRATION_SAMPLES)
    args = parser.parse_args()
    num_samples = args.num_samples

    model = Qwen3VLForConditionalGeneration.from_pretrained(
        MODEL_ID, torch_dtype="auto"
    )
    processor = AutoProcessor.from_pretrained(MODEL_ID)

    ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{num_samples}]")
    ds = ds.shuffle(seed=42)

    def preprocess_and_tokenize(example):
        # Re-encode the PIL image as a base64 data URI so it can be embedded
        # in the chat message content
        buffered = BytesIO()
        example["image"].save(buffered, format="PNG")
        encoded_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
        base64_qwen = f"data:image;base64,{encoded_image}"
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": base64_qwen},
                    {"type": "text", "text": "What does the image show?"},
                ],
            }
        ]
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        return processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=False,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
        )

    ds = ds.map(preprocess_and_tokenize, remove_columns=ds.column_names)

    def data_collator(batch):
        assert len(batch) == 1
        return {key: torch.tensor(value) for key, value in batch[0].items()}

    recipe = AWQModifier(
        scheme=args.scheme,
        ignore=["re:.*lm_head", "re:.*visual.*"],
        duo_scaling=False,
    )

    torch.cuda.reset_peak_memory_stats()
    start_time = time.time()
    oneshot(
        model=model,
        tokenizer=MODEL_ID,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=num_samples,
        data_collator=data_collator,
        sequential_targets=["Qwen3VLTextDecoderLayer"],
    )
    elapsed_time = time.time() - start_time
    peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3)
    print("Quantization Complete")
    print(f"Time: {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")
    print(f"Peak GPU Memory: {peak_memory_gb:.2f} GB")
    print("\n\n")

    print("========== SAMPLE GENERATION ==============")
    dispatch_model(model)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": "http://images.cocodataset.org/train2017/000000231895.jpg",
                },
                {"type": "text", "text": "Please describe the animal in this image\n"},
            ],
        }
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[prompt],
        images=image_inputs,
        videos=video_inputs,
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        return_tensors="pt",
    ).to(model.device)
    output = model.generate(**inputs, max_new_tokens=100)
    print(processor.decode(output[0], skip_special_tokens=True))
    print("==========================================\n\n")

    save_dir = args.save_dir or (
        MODEL_ID.rstrip("/").split("/")[-1] + f"-{args.scheme}"
    )
    model.save_pretrained(save_dir, save_compressed=True)
    processor.save_pretrained(save_dir)
    print(f"Model saved to {save_dir}")


if __name__ == "__main__":
    main()
── run_all_tests.sh ──
#!/bin/bash
# AWQ DDP Regression Test Suite
# Compares pre-DDP vs post-DDP AWQ quality across models, formats, and benchmarks.
#
# Usage:
#   ./run_all_tests.sh 2>&1 | tee regression_results.log
#   python extract_log_summary.py regression_results.log
#
# Quantized models are removed after each eval pass to free disk space;
# eval outputs and the results CSV are kept for follow-up analysis.
set -o pipefail

# Avoid permission errors on shared HF cache files
export HF_DATASETS_CACHE="/tmp/hf_datasets_cache"
mkdir -p "$HF_DATASETS_CACHE"

# ── Configuration ────────────────────────────────────────────────────────────
PRE_DDP_COMMIT="2ab02443"
POST_DDP_COMMIT="0bc916e5"

# Backup regression test scripts (they don't exist in old commits)
BACKUP_DIR="/tmp/awq_regression_tests_backup"
mkdir -p "$BACKUP_DIR"
cp -r examples/awq/regression_tests/* "$BACKUP_DIR/" 2>/dev/null || true

SCRIPTS=(
    "examples/awq/regression_tests/llama3_awq.py"
    "examples/awq/regression_tests/qwen3_vl_awq.py"
    "examples/awq/regression_tests/llama4_scout_awq.py"
    "examples/awq/regression_tests/qwen25_32b_awq.py"
    "examples/awq/regression_tests/mixtral_awq.py"
)
MODEL_SHORT_NAMES=(
    "Meta-Llama-3-8B-Instruct"
    "Qwen3-VL-8B-Instruct"
    "Llama-4-Scout-17B-16E-Instruct"
    "Qwen2.5-32B-Instruct"
    "Mixtral-8x7B-Instruct-v0.1"
)
# vLLM eval settings per model: max_model_len,tensor_parallel_size,num_gpus_quant
MODEL_VLLM_ARGS=(
    "2048,1,1"
    "4096,1,1"
    "4096,2,1"
    "4096,2,1"
    "2048,2,1"
)
SCHEMES=("W4A16_ASYM" "W8A8" "W8A16")

# eval_name        lm_eval_task     fewshot  backend
# gsm8k            gsm8k            5        vllm
# gsm8k_platinum   gsm8k_platinum   5        vllm
# wikitext         wikitext         0        vllm
# mmlu             mmlu             5        vllm
EVAL_NAMES=("gsm8k" "gsm8k_platinum" "wikitext" "mmlu")
EVAL_LM_TASKS=("gsm8k" "gsm8k_platinum" "wikitext" "mmlu")
EVAL_FEWSHOT=("5" "5" "0" "5")
EVAL_BACKENDS=("vllm" "vllm" "vllm" "vllm")

CODE_STATES=("pre-ddp" "post-ddp")
EVAL_BASE_DIR="./eval_results"
MODEL_BASE_DIR="./regression_models"
RESULTS_CSV="regression_results.csv"
mkdir -p "$EVAL_BASE_DIR" "$MODEL_BASE_DIR"

# ── Helper: activate environments ────────────────────────────────────────────
activate_quant_env() {
    source /home/HDCharles/rhdev/bin/activate
}
activate_eval_env() {
    source /home/HDCharles/vllm/bin/activate
}

# ── Helper: run vLLM evaluation with fallback chain ──────────────────────────
EVAL_BACKEND=""  # set by run_vllm_eval to indicate which backend succeeded
run_vllm_eval() {
    local save_dir=$1
    local task=$2
    local num_fewshot=$3
    local max_model_len=$4
    local tp_size=$5
    local eval_output_dir=$6
    mkdir -p "$eval_output_dir"
    EVAL_BACKEND="FAILED"
    activate_eval_env
    echo " EVAL: $task (fewshot=$num_fewshot, tp=$tp_size, max_len=$max_model_len)"

    # Build common eval flags
    local chat_args="--apply_chat_template"
    if [ "$num_fewshot" -gt 0 ]; then
        chat_args="$chat_args --fewshot_as_multiturn"
    fi

    # Try with tensor_parallel
    if [ "$tp_size" -gt 1 ]; then
        lm_eval \
            --model vllm \
            --model_args "pretrained=$save_dir,dtype=auto,max_model_len=$max_model_len,add_bos_token=True,tensor_parallel_size=$tp_size,gpu_memory_utilization=0.85" \
            --tasks "$task" \
            --num_fewshot "$num_fewshot" \
            --batch_size auto \
            $chat_args \
            --output_path "$eval_output_dir" 2>&1
        if [ $? -eq 0 ]; then EVAL_BACKEND="vllm_tp${tp_size}"; return 0; fi

        echo " TP=$tp_size failed, trying expert_parallel..."
        lm_eval \
            --model vllm \
            --model_args "pretrained=$save_dir,dtype=auto,max_model_len=$max_model_len,add_bos_token=True,enable_expert_parallel=True,gpu_memory_utilization=0.85" \
            --tasks "$task" \
            --num_fewshot "$num_fewshot" \
            --batch_size auto \
            $chat_args \
            --output_path "$eval_output_dir" 2>&1
        if [ $? -eq 0 ]; then EVAL_BACKEND="vllm_expert_parallel"; return 0; fi
    fi

    echo " Trying TP=1..."
    lm_eval \
        --model vllm \
        --model_args "pretrained=$save_dir,dtype=auto,max_model_len=$max_model_len,add_bos_token=True,gpu_memory_utilization=0.85" \
        --tasks "$task" \
        --num_fewshot "$num_fewshot" \
        --batch_size auto \
        $chat_args \
        --output_path "$eval_output_dir" 2>&1
    if [ $? -eq 0 ]; then EVAL_BACKEND="vllm_tp1"; return 0; fi

    echo " Trying enforce_eager..."
    lm_eval \
        --model vllm \
        --model_args "pretrained=$save_dir,dtype=auto,max_model_len=$max_model_len,add_bos_token=True,enforce_eager=True,gpu_memory_utilization=0.85" \
        --tasks "$task" \
        --num_fewshot "$num_fewshot" \
        --batch_size auto \
        $chat_args \
        --output_path "$eval_output_dir" 2>&1
    if [ $? -eq 0 ]; then EVAL_BACKEND="vllm_eager"; return 0; fi

    echo " Trying hf backend as last resort..."
    lm_eval \
        --model hf \
        --model_args "pretrained=$save_dir,dtype=auto,add_bos_token=True" \
        --tasks "$task" \
        --num_fewshot "$num_fewshot" \
        --batch_size auto \
        $chat_args \
        --output_path "$eval_output_dir" 2>&1
    if [ $? -eq 0 ]; then EVAL_BACKEND="hf"; return 0; fi

    EVAL_BACKEND="FAILED"
    return 1
}

# ── Helper: run HF-only evaluation ───────────────────────────────────────────
run_hf_eval() {
    local save_dir=$1
    local task=$2
    local num_fewshot=$3
    local eval_output_dir=$4
    mkdir -p "$eval_output_dir"
    EVAL_BACKEND="FAILED"
    activate_eval_env
    echo " EVAL: $task (fewshot=$num_fewshot, backend=hf)"

    # Build common eval flags
    local chat_args="--apply_chat_template"
    if [ "$num_fewshot" -gt 0 ]; then
        chat_args="$chat_args --fewshot_as_multiturn"
    fi

    lm_eval \
        --model hf \
        --model_args "pretrained=$save_dir,dtype=auto,add_bos_token=True" \
        --tasks "$task" \
        --num_fewshot "$num_fewshot" \
        --batch_size auto \
        $chat_args \
        --output_path "$eval_output_dir" 2>&1
    if [ $? -eq 0 ]; then EVAL_BACKEND="hf"; return 0; fi

    EVAL_BACKEND="FAILED"
    return 1
}

# ── Helper: extract metric from lm_eval JSON results ─────────────────────────
extract_metric() {
    local eval_output_dir=$1
    local task=$2
    # Find the most recent results JSON in the eval output dir
    local results_json
    results_json=$(find "$eval_output_dir" -name "results_*.json" -type f 2>/dev/null | sort | tail -1)
    if [ -z "$results_json" ]; then
        echo "N/A"
        return
    fi
    python3 -c "
import json, sys
with open('$results_json') as f:
    data = json.load(f)
results = data.get('results', {})
task = '$task'
# Handle task name variations (e.g., wikitext vs wikitext2)
task_results = None
for key in results:
    if task in key:
        task_results = results[key]
        break
if task_results is None:
    print('N/A')
    sys.exit()
# Extract the primary metric for each task
if 'gsm8k' in task:
    val = task_results.get('exact_match,flexible-extract')
    if val is not None:
        print(f'{val*100:.2f}%')
    else:
        print('N/A')
elif 'wikitext' in task:
    val = task_results.get('word_perplexity,none')
    if val is not None:
        print(f'{val:.2f}')
    else:
        print('N/A')
elif 'mmlu' in task:
    val = task_results.get('acc,none')
    if val is not None:
        print(f'{val*100:.2f}%')
    else:
        print('N/A')
else:
    # Generic: grab first non-stderr, non-alias metric
    for k, v in task_results.items():
        if 'stderr' not in k and k != 'alias' and isinstance(v, (int, float)):
            print(f'{v:.4f}')
            sys.exit()
    print('N/A')
" 2>/dev/null || echo "N/A"
}

# ── Helper: switch code state ────────────────────────────────────────────────
switch_code_state() {
    local state=$1
    activate_quant_env
    if [ "$state" == "pre-ddp" ]; then
        echo "Switching to pre-DDP code ($PRE_DDP_COMMIT)..."
        git checkout "$PRE_DDP_COMMIT" 2>&1
    elif [ "$state" == "post-ddp" ]; then
        echo "Switching to post-DDP code ($POST_DDP_COMMIT)..."
        git checkout "$POST_DDP_COMMIT" 2>&1
    fi
    # Restore test scripts as untracked files
    mkdir -p examples/awq/regression_tests
    cp -r "$BACKUP_DIR"/* examples/awq/regression_tests/ 2>/dev/null || true
    pip install -e . --quiet 2>&1
}

# ── Helper: print current results summary ────────────────────────────────────
print_summary() {
    echo ""
    echo "╔══════════════════════════════════════════════════════════════════════════════════════════════════════╗"
    echo "║ RESULTS SUMMARY (so far) ║"
    echo "╚══════════════════════════════════════════════════════════════════════════════════════════════════════╝"
    echo ""
    if [ -f "$RESULTS_CSV" ]; then
        # Print header + all rows as a formatted table
        column -t -s',' < "$RESULTS_CSV"
    else
        echo "(no results yet)"
    fi
    echo ""
    echo "════════════════════════════════════════════════════════════════════════════════════════════════════════"
    echo ""
}

# ── Initialize results CSV (back up previous results, then start fresh) ──────
if [ -f "$RESULTS_CSV" ]; then
    cp "$RESULTS_CSV" "${RESULTS_CSV}.bak"
fi
echo "model,scheme,code_state,task,metric,status,eval_backend,save_dir" > "$RESULTS_CSV"

# ── Main loop ─────────────────────────────────────────────────────────────────
TOTAL=0
PASSED=0
FAILED=0
for model_idx in "${!SCRIPTS[@]}"; do
    script="${SCRIPTS[$model_idx]}"
    model_name="${MODEL_SHORT_NAMES[$model_idx]}"
    IFS=',' read -r max_model_len tp_size num_gpus_quant <<< "${MODEL_VLLM_ARGS[$model_idx]}"

    for scheme in "${SCHEMES[@]}"; do
        for code_state in "${CODE_STATES[@]}"; do
            save_dir="$MODEL_BASE_DIR/${model_name}-${scheme}-${code_state}"
            echo ""
            echo "╔══════════════════════════════════════════════════════════════════════════════════════════╗"
            echo "║ MODEL: $model_name"
            echo "║ SCHEME: $scheme"
            echo "║ CODE STATE: $code_state"
            echo "╚══════════════════════════════════════════════════════════════════════════════════════════╝"
            echo ""

            # ── Skip entirely if all evals already have results ──────────────
            all_evals_cached=true
            for eval_idx in "${!EVAL_NAMES[@]}"; do
                eval_dir="$EVAL_BASE_DIR/${model_name}-${scheme}-${code_state}/${EVAL_NAMES[$eval_idx]}"
                if ! find "$eval_dir" -name "results_*.json" -type f 2>/dev/null | grep -q .; then
                    all_evals_cached=false
                    break
                fi
            done
            if [ "$all_evals_cached" = true ]; then
                echo "All evals already cached, skipping quantization and eval."
                for eval_idx in "${!EVAL_NAMES[@]}"; do
                    eval_name="${EVAL_NAMES[$eval_idx]}"
                    lm_task="${EVAL_LM_TASKS[$eval_idx]}"
                    eval_dir="$EVAL_BASE_DIR/${model_name}-${scheme}-${code_state}/${eval_name}"
                    metric_val=$(extract_metric "$eval_dir" "$lm_task")
                    echo " $eval_name: $metric_val"
                    echo "$model_name,$scheme,$code_state,$eval_name,$metric_val,PASSED,cached,$save_dir" >> "$RESULTS_CSV"
                    PASSED=$((PASSED + 1))
                    TOTAL=$((TOTAL + 1))
                done
                print_summary
                continue
            fi

            # ── Quantize (skip if model already exists) ──────────────────────
            if [ -d "$save_dir" ] && [ -f "$save_dir/config.json" ]; then
                echo "Quantized model already exists at $save_dir, skipping quantization."
            else
                # Switch code state
                switch_code_state "$code_state"
                echo "============================================"
                echo "Running: $script --scheme $scheme"
                echo "============================================"
                if [ "$num_gpus_quant" -gt 1 ]; then
                    torchrun --nproc_per_node="$num_gpus_quant" "$script" \
                        --scheme "$scheme" --save-dir "$save_dir" 2>&1
                else
                    python "$script" --scheme "$scheme" --save-dir "$save_dir" 2>&1
                fi
                quant_status=$?
                if [ $quant_status -ne 0 ]; then
                    echo "QUANTIZATION FAILED for $model_name / $scheme / $code_state"
                    for eval_name in "${EVAL_NAMES[@]}"; do
                        echo "$model_name,$scheme,$code_state,$eval_name,N/A,QUANT_FAILED,N/A,$save_dir" >> "$RESULTS_CSV"
                    done
                    FAILED=$((FAILED + ${#EVAL_NAMES[@]}))
                    TOTAL=$((TOTAL + ${#EVAL_NAMES[@]}))
                    print_summary
                    continue
                fi
            fi

            # ── Clear GPU memory before eval ─────────────────────────────────
            python3 -c "import torch; torch.cuda.empty_cache(); [torch.cuda.reset_peak_memory_stats(i) for i in range(torch.cuda.device_count())]" 2>/dev/null

            # ── Evaluate ─────────────────────────────────────────────────────
            for eval_idx in "${!EVAL_NAMES[@]}"; do
                eval_name="${EVAL_NAMES[$eval_idx]}"
                lm_task="${EVAL_LM_TASKS[$eval_idx]}"
                fewshot="${EVAL_FEWSHOT[$eval_idx]}"
                backend="${EVAL_BACKENDS[$eval_idx]}"
                eval_dir="$EVAL_BASE_DIR/${model_name}-${scheme}-${code_state}/${eval_name}"
                TOTAL=$((TOTAL + 1))

                # Skip eval if results already exist
                existing_result=$(find "$eval_dir" -name "results_*.json" -type f 2>/dev/null | sort | tail -1)
                if [ -n "$existing_result" ]; then
                    metric_val=$(extract_metric "$eval_dir" "$lm_task")
                    echo " EVAL: $eval_name — skipping, previous result found: $metric_val"
                    echo "$model_name,$scheme,$code_state,$eval_name,$metric_val,PASSED,cached,$save_dir" >> "$RESULTS_CSV"
                    PASSED=$((PASSED + 1))
                    continue
                fi

                if [ "$backend" == "hf" ]; then
                    run_hf_eval "$save_dir" "$lm_task" "$fewshot" "$eval_dir"
                else
                    run_vllm_eval "$save_dir" "$lm_task" "$fewshot" "$max_model_len" "$tp_size" "$eval_dir"
                fi
                eval_status=$?
                if [ $eval_status -eq 0 ]; then
                    metric_val=$(extract_metric "$eval_dir" "$lm_task")
                    echo "$model_name,$scheme,$code_state,$eval_name,$metric_val,PASSED,$EVAL_BACKEND,$save_dir" >> "$RESULTS_CSV"
                    PASSED=$((PASSED + 1))
                else
                    echo "$model_name,$scheme,$code_state,$eval_name,N/A,FAILED,$EVAL_BACKEND,$save_dir" >> "$RESULTS_CSV"
                    FAILED=$((FAILED + 1))
                fi
            done

            # ── Clean up model to free disk space ────────────────────────────
            if [ -d "$save_dir" ]; then
                echo "Removing quantized model at $save_dir to free disk space."
                rm -rf "$save_dir"
            fi

            print_summary
        done # code_state
    done # scheme
done # model
# ── Final Summary ─────────────────────────────────────────────────────────────
echo ""
echo "╔══════════════════════════════════════════════════════════════════════════════════════════╗"
echo "║ FINAL SUMMARY: $PASSED passed, $FAILED failed out of $TOTAL total evaluations ║"
echo "╚══════════════════════════════════════════════════════════════════════════════════════════╝"
echo ""
print_summary
echo "Results CSV: $RESULTS_CSV"
echo "Eval outputs: $EVAL_BASE_DIR/"
echo "Model dir: $MODEL_BASE_DIR/ (quantized models are removed after eval to free disk space)"
echo ""
echo "To extract detailed metrics from the log:"
echo " python extract_log_summary.py regression_results.log"
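Besides column -t and the log parser, the CSV reshapes cleanly into a side-by-side pre/post view. A minimal pandas sketch (assuming pandas is available; the metric cells are the preformatted strings the script writes, e.g. 75.06% or N/A, so this only reshapes and does not aggregate numerically):

# Sketch: pivot regression_results.csv into pre-ddp vs post-ddp columns.
import pandas as pd

df = pd.read_csv("regression_results.csv")
pivot = df.pivot_table(
    index=["model", "scheme", "task"],
    columns="code_state",
    values="metric",
    aggfunc="first",  # cells are strings; just take the recorded value
)
print(pivot.to_string())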