LLM Compressor regression testing scripts: testing/quantize.py applies one quantization technique/scheme to a model, run_all_tests.sh sweeps the full models x techniques x schemes x branches matrix and evaluates every result, and testing/test_nvfp4.py is a standalone test for the NVFP4 schemes.
import argparse
import time
import torch
from compressed_tensors.offload import dispatch_model
from compressed_tensors.quantization import preset_name_to_scheme
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor import oneshot
from llmcompressor.modifiers.gptq import GPTQModifier
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.modifiers.transform.awq import AWQModifier
from llmcompressor.modifiers.transform.imatrix import IMatrixGatherer
from llmcompressor.modifiers.transform.smoothquant import SmoothQuantModifier
MODEL_CONFIGS = {
    "meta-llama/Meta-Llama-3-8B-Instruct": {
        "ignore": ["lm_head"],
        "is_moe": False,
    },
    "Qwen/Qwen3-30B-A3B": {
        "ignore": ["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"],
        "is_moe": True,
    },
}
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
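
# build_recipe() maps each --technique flag to a list of llm-compressor modifiers:
#   awq_rtn - AWQModifier scaling pass followed by round-to-nearest quantization
#   rtn     - plain round-to-nearest via QuantizationModifier
#   rtn_mse - RTN with the weight observer swapped to "memoryless_mse"
#   gptq    - GPTQModifier, preceded by SmoothQuant for W8A8 schemes
#   imatrix - IMatrixGatherer plus RTN with the "imatrix_mse" observer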
def build_recipe(technique, scheme, ignore, is_moe):
    if technique == "awq_rtn":
        # duo scaling is disabled for MoE models
        duo = "both" if not is_moe else False
        return [
            AWQModifier(duo_scaling=duo),
            QuantizationModifier(
                ignore=ignore, scheme=scheme, targets=["Linear"]
            ),
        ]
    elif technique == "rtn":
        return [
            QuantizationModifier(
                ignore=ignore, scheme=scheme, targets=["Linear"]
            ),
        ]
    elif technique == "rtn_mse":
        scheme_obj = preset_name_to_scheme(scheme, ["Linear"])
        scheme_obj.weights.observer = "memoryless_mse"
        return [
            QuantizationModifier(
                config_groups={"group_0": scheme_obj},
                ignore=ignore,
            ),
        ]
    elif technique == "gptq":
        recipe = []
        # SmoothQuant only helps when activations are quantized, so add it for W8A8
        if "W8A8" in scheme:
            recipe.append(SmoothQuantModifier(smoothing_strength=0.8))
        recipe.append(
            GPTQModifier(ignore=ignore, scheme=scheme, targets=["Linear"])
        )
        return recipe
    elif technique == "imatrix":
        scheme_obj = preset_name_to_scheme(scheme, ["Linear"])
        scheme_obj.weights.observer = "imatrix_mse"
        return [
            IMatrixGatherer(ignore=ignore),
            QuantizationModifier(
                config_groups={"group_0": scheme_obj},
                ignore=ignore,
            ),
        ]
    else:
        raise ValueError(f"Unknown technique: {technique}")
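
# For example, build_recipe("gptq", "W8A8", ["lm_head"], False) returns
# [SmoothQuantModifier(smoothing_strength=0.8),
#  GPTQModifier(ignore=["lm_head"], scheme="W8A8", targets=["Linear"])].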
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", required=True)
    parser.add_argument(
        "--technique", required=True,
        choices=["awq_rtn", "rtn", "rtn_mse", "gptq", "imatrix"],
    )
    parser.add_argument("--scheme", required=True)
    parser.add_argument("--save-dir", required=True)
    parser.add_argument("--num-samples", type=int, default=256)
    parser.add_argument("--max-seq-length", type=int, default=512)
    args = parser.parse_args()

    config = MODEL_CONFIGS.get(args.model)
    if config is None:
        raise ValueError(
            f"Unknown model: {args.model}. "
            f"Known models: {list(MODEL_CONFIGS.keys())}"
        )

    model = AutoModelForCausalLM.from_pretrained(args.model, dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)

    # Load and preprocess the calibration dataset
    ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{args.num_samples}]")
    ds = ds.shuffle(seed=42)

    def preprocess(example):
        return {
            "text": tokenizer.apply_chat_template(
                example["messages"],
                tokenize=False,
            )
        }

    ds = ds.map(preprocess)

    def tokenize(sample):
        return tokenizer(
            sample["text"],
            padding=False,
            max_length=args.max_seq_length,
            truncation=True,
            add_special_tokens=False,
        )

    ds = ds.map(tokenize, remove_columns=ds.column_names)

    recipe = build_recipe(
        args.technique, args.scheme, config["ignore"], config["is_moe"]
    )

    # Time the quantization pass and track peak GPU memory
    torch.cuda.reset_peak_memory_stats()
    start_time = time.time()
    oneshot_kwargs = dict(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=args.max_seq_length,
        num_calibration_samples=args.num_samples,
    )
    oneshot(**oneshot_kwargs)
    elapsed_time = time.time() - start_time
    peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3)

    print("Quantization Complete")
    print(f"Technique: {args.technique}, Scheme: {args.scheme}")
    print(f"Time: {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")
    print(f"Peak GPU Memory: {peak_memory_gb:.2f} GB")

    print("\n\n")
    print("========== SAMPLE GENERATION ==============")
    dispatch_model(model)
    input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
        model.device
    )
    output = model.generate(input_ids, max_new_tokens=100)
    print(tokenizer.decode(output[0]))
    print("==========================================\n\n")

    model.save_pretrained(args.save_dir, save_compressed=True)
    tokenizer.save_pretrained(args.save_dir)
    print(f"Model saved to {args.save_dir}")


if __name__ == "__main__":
    main()
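
For a single manual run outside the harness below, quantize.py can be invoked the same way run_all_tests.sh does. A sketch of the single-GPU path (the multi-GPU path swaps python for torchrun --nproc_per_node=N):

python testing/quantize.py \
    --model meta-llama/Meta-Llama-3-8B-Instruct \
    --technique gptq \
    --scheme W4A16 \
    --save-dir ./regression_models/Meta-Llama-3-8B-Instruct-W4A16-gptq-main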
#!/bin/bash
# Observer Refactoring Regression Test Suite
# Tests all combinations of models x techniques x schemes x branches.
# For each combo, checks out main and the PR branch, reinstalls, quantizes, and evals.
#
# Usage:
# ./run_all_tests.sh 2>&1 | tee regression_results.log
# python extract_log_summary.py regression_results.log
set -o pipefail
# Avoid permission errors on shared HF cache files
export HF_DATASETS_CACHE="/tmp/hf_datasets_cache"
mkdir -p "$HF_DATASETS_CACHE"
# ── Configuration ────────────────────────────────────────────────────────────
REPO_DIR="$(cd "$(dirname "$0")/.." && pwd)"
MODELS=("meta-llama/Meta-Llama-3-8B-Instruct" "Qwen/Qwen3-30B-A3B")
MODEL_SHORT_NAMES=("Meta-Llama-3-8B-Instruct" "Qwen3-30B-A3B")
# max_model_len,tensor_parallel_size,num_gpus_quant
MODEL_VLLM_ARGS=("2048,1,1" "2048,2,1")
TECHNIQUES=("awq_rtn" "rtn" "rtn_mse" "gptq" "imatrix")
SCHEMES=("W4A16" "W8A8")
BRANCHES=("main" "90_refactor_obs")
EVAL_NAMES=("gsm8k_platinum" "wikitext" "mmlu")
EVAL_LM_TASKS=("gsm8k_platinum" "wikitext" "mmlu")
EVAL_FEWSHOT=("5" "0" "5")
EVAL_BACKENDS=("vllm" "vllm" "vllm")
EVAL_BASE_DIR="./eval_results"
MODEL_BASE_DIR="./regression_models"
RESULTS_CSV="regression_results.csv"
ORIGINAL_BRANCH=$(git -C "$REPO_DIR" rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown")
mkdir -p "$EVAL_BASE_DIR" "$MODEL_BASE_DIR"
# ── Helper: activate environments ────────────────────────────────────────────
activate_quant_env() {
    source /home/HDCharles/rhdev/bin/activate
}

activate_eval_env() {
    source /home/HDCharles/vllm/bin/activate
}

# ── Helper: checkout branch and reinstall ────────────────────────────────────
switch_branch() {
    local branch=$1
    echo " Switching to branch: $branch"
    git -C "$REPO_DIR" checkout "$branch" 2>&1
    if [ $? -ne 0 ]; then
        echo " ERROR: git checkout $branch failed"
        return 1
    fi
    activate_quant_env
    pip install -e "$REPO_DIR" 2>&1 | tail -1
    echo " Installed llm-compressor from branch $branch"
}
# ── Helper: run vLLM evaluation with fallback chain ──────────────────────────
EVAL_BACKEND=""
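# Fallback chain, in order: vLLM with TP=$tp_size → vLLM expert-parallel
# (tried only when TP>1 fails) → vLLM TP=1 → vLLM enforce_eager → HF.
# Whichever backend succeeds is recorded in EVAL_BACKEND for the CSV.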
run_vllm_eval() {
    local save_dir=$1
    local task=$2
    local num_fewshot=$3
    local max_model_len=$4
    local tp_size=$5
    local eval_output_dir=$6
    mkdir -p "$eval_output_dir"
    EVAL_BACKEND="FAILED"
    activate_eval_env
    echo " EVAL: $task (fewshot=$num_fewshot, tp=$tp_size, max_len=$max_model_len)"
    local chat_args="--apply_chat_template"
    if [ "$num_fewshot" -gt 0 ]; then
        chat_args="$chat_args --fewshot_as_multiturn"
    fi
    if [ "$tp_size" -gt 1 ]; then
        lm_eval \
            --model vllm \
            --model_args "pretrained=$save_dir,dtype=auto,max_model_len=$max_model_len,add_bos_token=True,tensor_parallel_size=$tp_size,gpu_memory_utilization=0.85" \
            --tasks "$task" \
            --num_fewshot "$num_fewshot" \
            --batch_size auto \
            $chat_args \
            --output_path "$eval_output_dir" 2>&1
        if [ $? -eq 0 ]; then EVAL_BACKEND="vllm_tp${tp_size}"; return 0; fi
        echo " TP=$tp_size failed, trying expert_parallel..."
        lm_eval \
            --model vllm \
            --model_args "pretrained=$save_dir,dtype=auto,max_model_len=$max_model_len,add_bos_token=True,enable_expert_parallel=True,gpu_memory_utilization=0.85" \
            --tasks "$task" \
            --num_fewshot "$num_fewshot" \
            --batch_size auto \
            $chat_args \
            --output_path "$eval_output_dir" 2>&1
        if [ $? -eq 0 ]; then EVAL_BACKEND="vllm_expert_parallel"; return 0; fi
    fi
    echo " Trying TP=1..."
    lm_eval \
        --model vllm \
        --model_args "pretrained=$save_dir,dtype=auto,max_model_len=$max_model_len,add_bos_token=True,gpu_memory_utilization=0.85" \
        --tasks "$task" \
        --num_fewshot "$num_fewshot" \
        --batch_size auto \
        $chat_args \
        --output_path "$eval_output_dir" 2>&1
    if [ $? -eq 0 ]; then EVAL_BACKEND="vllm_tp1"; return 0; fi
    echo " Trying enforce_eager..."
    lm_eval \
        --model vllm \
        --model_args "pretrained=$save_dir,dtype=auto,max_model_len=$max_model_len,add_bos_token=True,enforce_eager=True,gpu_memory_utilization=0.85" \
        --tasks "$task" \
        --num_fewshot "$num_fewshot" \
        --batch_size auto \
        $chat_args \
        --output_path "$eval_output_dir" 2>&1
    if [ $? -eq 0 ]; then EVAL_BACKEND="vllm_eager"; return 0; fi
    echo " Trying hf backend as last resort..."
    lm_eval \
        --model hf \
        --model_args "pretrained=$save_dir,dtype=auto,add_bos_token=True" \
        --tasks "$task" \
        --num_fewshot "$num_fewshot" \
        --batch_size auto \
        $chat_args \
        --output_path "$eval_output_dir" 2>&1
    if [ $? -eq 0 ]; then EVAL_BACKEND="hf"; return 0; fi
    EVAL_BACKEND="FAILED"
    return 1
}
# ── Helper: run HF-only evaluation ─────────────────────────────────────────
run_hf_eval() {
    local save_dir=$1
    local task=$2
    local num_fewshot=$3
    local eval_output_dir=$4
    mkdir -p "$eval_output_dir"
    EVAL_BACKEND="FAILED"
    activate_eval_env
    echo " EVAL: $task (fewshot=$num_fewshot, backend=hf)"
    local chat_args="--apply_chat_template"
    if [ "$num_fewshot" -gt 0 ]; then
        chat_args="$chat_args --fewshot_as_multiturn"
    fi
    lm_eval \
        --model hf \
        --model_args "pretrained=$save_dir,dtype=auto,add_bos_token=True" \
        --tasks "$task" \
        --num_fewshot "$num_fewshot" \
        --batch_size auto \
        $chat_args \
        --output_path "$eval_output_dir" 2>&1
    if [ $? -eq 0 ]; then EVAL_BACKEND="hf"; return 0; fi
    EVAL_BACKEND="FAILED"
    return 1
}
# ── Helper: extract metric from lm_eval JSON results ────────────────────────
extract_metric() {
    local eval_output_dir=$1
    local task=$2
    local results_json
    results_json=$(find "$eval_output_dir" -name "results_*.json" -type f 2>/dev/null | sort | tail -1)
    if [ -z "$results_json" ]; then
        echo "N/A"
        return
    fi
    python3 -c "
import json, sys

with open('$results_json') as f:
    data = json.load(f)
results = data.get('results', {})
task = '$task'
task_results = None
for key in results:
    if task in key:
        task_results = results[key]
        break
if task_results is None:
    print('N/A')
    sys.exit()
if 'gsm8k' in task:
    val = task_results.get('exact_match,strict-match')
    if val is not None:
        print(f'{val*100:.2f}%')
    else:
        print('N/A')
elif 'wikitext' in task:
    val = task_results.get('word_perplexity,none')
    if val is not None:
        print(f'{val:.2f}')
    else:
        print('N/A')
elif 'mmlu' in task:
    val = task_results.get('acc,none')
    if val is not None:
        print(f'{val*100:.2f}%')
    else:
        print('N/A')
else:
    for k, v in task_results.items():
        if 'stderr' not in k and k != 'alias' and isinstance(v, (int, float)):
            print(f'{v:.4f}')
            sys.exit()
    print('N/A')
" 2>/dev/null || echo "N/A"
}
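# Reported metrics: gsm8k_platinum → exact_match (strict-match) as a percent,
# wikitext → word perplexity (lower is better), mmlu → accuracy as a percent;
# any other task falls back to its first non-stderr numeric metric.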
# ── Helper: print current results summary ────────────────────────────────────
print_summary() {
    echo ""
    echo "╔══════════════════════════════════════════════════════════════════════════════════════════════════════╗"
    echo "║ RESULTS SUMMARY (so far) ║"
    echo "╚══════════════════════════════════════════════════════════════════════════════════════════════════════╝"
    echo ""
    if [ -f "$RESULTS_CSV" ]; then
        column -t -s',' < "$RESULTS_CSV"
    else
        echo "(no results yet)"
    fi
    echo ""
    echo "════════════════════════════════════════════════════════════════════════════════════════════════════════"
    echo ""
}
# ── Helper: print branch comparison table ─────────────────────────────────
print_comparison() {
    if [ ! -f "$RESULTS_CSV" ]; then
        return
    fi
    python3 - "$RESULTS_CSV" <<'PYEOF'
import csv, sys

csv_path = sys.argv[1]
rows = []
with open(csv_path) as f:
    reader = csv.DictReader(f)
    for r in reader:
        rows.append(r)
if not rows:
    sys.exit()

# Build lookup: (model, scheme, technique, task) -> {branch: metric}
lookup = {}
for r in rows:
    key = (r["model"], r["scheme"], r["technique"], r["task"])
    lookup.setdefault(key, {})
    lookup[key][r["branch"]] = r["metric"]

entries = [(k, v) for k, v in lookup.items()
           if "main" in v and any(b != "main" for b in v)]
if not entries:
    sys.exit()

pr_branch = [b for b in next(iter(lookup.values())) if b != "main"]
pr_branch = pr_branch[0] if pr_branch else "pr"

def parse_metric(s):
    s = s.strip()
    if s.endswith("%"):
        return float(s[:-1]), True
    try:
        return float(s), False
    except ValueError:
        return None, False

def calc_change(main_str, pr_str, task):
    m_val, _ = parse_metric(main_str)
    p_val, _ = parse_metric(pr_str)
    if m_val is None or p_val is None or m_val == 0:
        return "N/A"
    # wikitext reports perplexity (lower is better), so the sign is flipped
    # so that a positive change always means the PR branch improved
    if "wikitext" in task:
        pct = (m_val - p_val) / m_val * 100
    else:
        pct = (p_val - m_val) / m_val * 100
    sign = "+" if pct >= 0 else ""
    return f"{sign}{pct:.2f}%"

print("")
print("╔════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╗")
print(f"║ BRANCH COMPARISON (main vs {pr_branch})")
print("╚════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╝")
print("")
header = (f"{'model':<30} {'scheme':<10} {'technique':<12} {'task':<18} "
          f"{'main':>14} {'PR':>14} {'change':>12}")
print(header)
print("-" * len(header))
for (model, scheme, technique, task), metrics in sorted(entries):
    m = metrics.get("main", "")
    p = metrics.get(pr_branch, "")
    change = calc_change(m, p, task) if m and p else ""
    print(f"{model:<30} {scheme:<10} {technique:<12} {task:<18} "
          f"{m:>14} {p:>14} {change:>12}")
print("")
PYEOF
}
# ── Initialize results CSV ──────────────────────────────────────────────────
if [ -f "$RESULTS_CSV" ]; then
cp "$RESULTS_CSV" "${RESULTS_CSV}.bak"
fi
echo "model,scheme,technique,branch,task,metric,status,eval_backend,save_dir" > "$RESULTS_CSV"
# ── Main loop ────────────────────────────────────────────────────────────────
TOTAL=0
PASSED=0
FAILED=0
for model_idx in "${!MODELS[@]}"; do
    model="${MODELS[$model_idx]}"
    model_short="${MODEL_SHORT_NAMES[$model_idx]}"
    IFS=',' read -r max_model_len tp_size num_gpus_quant <<< "${MODEL_VLLM_ARGS[$model_idx]}"

    for technique in "${TECHNIQUES[@]}"; do
        for scheme in "${SCHEMES[@]}"; do
            for branch in "${BRANCHES[@]}"; do
                save_dir="$MODEL_BASE_DIR/${model_short}-${scheme}-${technique}-${branch}"
                echo ""
                echo "╔══════════════════════════════════════════════════════════════════════════════════════════╗"
                echo "║ MODEL: $model_short"
                echo "║ SCHEME: $scheme"
                echo "║ TECHNIQUE: $technique"
                echo "║ BRANCH: $branch"
                echo "╚══════════════════════════════════════════════════════════════════════════════════════════╝"
                echo ""

                # ── Skip entirely if all evals already have results ────
                all_evals_cached=true
                for eval_idx in "${!EVAL_NAMES[@]}"; do
                    eval_dir="$EVAL_BASE_DIR/${model_short}-${scheme}-${technique}-${branch}/${EVAL_NAMES[$eval_idx]}"
                    if ! find "$eval_dir" -name "results_*.json" -type f 2>/dev/null | grep -q .; then
                        all_evals_cached=false
                        break
                    fi
                done
                if [ "$all_evals_cached" = true ]; then
                    echo "All evals already cached, skipping quantization and eval."
                    for eval_idx in "${!EVAL_NAMES[@]}"; do
                        eval_name="${EVAL_NAMES[$eval_idx]}"
                        lm_task="${EVAL_LM_TASKS[$eval_idx]}"
                        eval_dir="$EVAL_BASE_DIR/${model_short}-${scheme}-${technique}-${branch}/${eval_name}"
                        metric_val=$(extract_metric "$eval_dir" "$lm_task")
                        echo " $eval_name: $metric_val"
                        echo "$model_short,$scheme,$technique,$branch,$eval_name,$metric_val,PASSED,cached,$save_dir" >> "$RESULTS_CSV"
                        PASSED=$((PASSED + 1))
                        TOTAL=$((TOTAL + 1))
                    done
                    print_summary
                    print_comparison
                    continue
                fi

                # ── Switch branch and reinstall ───────────────────────
                switch_branch "$branch"
                if [ $? -ne 0 ]; then
                    echo "BRANCH SWITCH FAILED for $branch"
                    for eval_name in "${EVAL_NAMES[@]}"; do
                        echo "$model_short,$scheme,$technique,$branch,$eval_name,N/A,BRANCH_FAILED,N/A,$save_dir" >> "$RESULTS_CSV"
                    done
                    FAILED=$((FAILED + ${#EVAL_NAMES[@]}))
                    TOTAL=$((TOTAL + ${#EVAL_NAMES[@]}))
                    print_summary
                    continue
                fi

                # ── Quantize (skip if model already exists) ────────────
                if [ -d "$save_dir" ] && [ -f "$save_dir/config.json" ]; then
                    echo "Quantized model already exists at $save_dir, skipping quantization."
                else
                    activate_quant_env
                    echo "============================================"
                    echo "Running: quantize.py (model=$model_short, technique=$technique, scheme=$scheme, branch=$branch)"
                    echo "============================================"
                    if [ "$num_gpus_quant" -gt 1 ]; then
                        torchrun --nproc_per_node="$num_gpus_quant" "$REPO_DIR/testing/quantize.py" \
                            --model "$model" --technique "$technique" --scheme "$scheme" \
                            --save-dir "$save_dir" 2>&1
                    else
                        python "$REPO_DIR/testing/quantize.py" \
                            --model "$model" --technique "$technique" --scheme "$scheme" \
                            --save-dir "$save_dir" 2>&1
                    fi
                    quant_status=$?
                    if [ $quant_status -ne 0 ]; then
                        echo "QUANTIZATION FAILED for $model_short / $scheme / $technique / $branch"
                        for eval_name in "${EVAL_NAMES[@]}"; do
                            echo "$model_short,$scheme,$technique,$branch,$eval_name,N/A,QUANT_FAILED,N/A,$save_dir" >> "$RESULTS_CSV"
                        done
                        FAILED=$((FAILED + ${#EVAL_NAMES[@]}))
                        TOTAL=$((TOTAL + ${#EVAL_NAMES[@]}))
                        print_summary
                        print_comparison
                        continue
                    fi
                fi

                # ── Clear GPU memory before eval ─────────────────────────
                python3 -c "import torch; torch.cuda.empty_cache(); [torch.cuda.reset_peak_memory_stats(i) for i in range(torch.cuda.device_count())]" 2>/dev/null

                # ── Evaluate ─────────────────────────────────────────────
                for eval_idx in "${!EVAL_NAMES[@]}"; do
                    eval_name="${EVAL_NAMES[$eval_idx]}"
                    lm_task="${EVAL_LM_TASKS[$eval_idx]}"
                    fewshot="${EVAL_FEWSHOT[$eval_idx]}"
                    backend="${EVAL_BACKENDS[$eval_idx]}"
                    eval_dir="$EVAL_BASE_DIR/${model_short}-${scheme}-${technique}-${branch}/${eval_name}"
                    TOTAL=$((TOTAL + 1))

                    # Skip eval if results already exist
                    existing_result=$(find "$eval_dir" -name "results_*.json" -type f 2>/dev/null | sort | tail -1)
                    if [ -n "$existing_result" ]; then
                        metric_val=$(extract_metric "$eval_dir" "$lm_task")
                        echo " EVAL: $eval_name — skipping, previous result found: $metric_val"
                        echo "$model_short,$scheme,$technique,$branch,$eval_name,$metric_val,PASSED,cached,$save_dir" >> "$RESULTS_CSV"
                        PASSED=$((PASSED + 1))
                        continue
                    fi

                    if [ "$backend" == "hf" ]; then
                        run_hf_eval "$save_dir" "$lm_task" "$fewshot" "$eval_dir"
                    else
                        run_vllm_eval "$save_dir" "$lm_task" "$fewshot" "$max_model_len" "$tp_size" "$eval_dir"
                    fi
                    eval_status=$?
                    if [ $eval_status -eq 0 ]; then
                        metric_val=$(extract_metric "$eval_dir" "$lm_task")
                        echo "$model_short,$scheme,$technique,$branch,$eval_name,$metric_val,PASSED,$EVAL_BACKEND,$save_dir" >> "$RESULTS_CSV"
                        PASSED=$((PASSED + 1))
                    else
                        echo "$model_short,$scheme,$technique,$branch,$eval_name,N/A,FAILED,$EVAL_BACKEND,$save_dir" >> "$RESULTS_CSV"
                        FAILED=$((FAILED + 1))
                    fi
                done

                # ── Clean up model to free disk space ────────────────────
                if [ -d "$save_dir" ]; then
                    echo "Removing quantized model at $save_dir to free disk space."
                    rm -rf "$save_dir"
                fi

                print_summary
                print_comparison
            done # branch
        done # scheme
    done # technique
done # model
# ── Restore original branch ─────────────────────────────────────────────────
echo "Restoring original branch: $ORIGINAL_BRANCH"
git -C "$REPO_DIR" checkout "$ORIGINAL_BRANCH" 2>&1
activate_quant_env
pip install -e "$REPO_DIR" 2>&1 | tail -1
# ── Final Summary ────────────────────────────────────────────────────────────
echo ""
echo "╔══════════════════════════════════════════════════════════════════════════════════════════╗"
echo "║ FINAL SUMMARY: $PASSED passed, $FAILED failed out of $TOTAL total evaluations ║"
echo "╚══════════════════════════════════════════════════════════════════════════════════════════╝"
echo ""
print_summary
print_comparison
echo "Results CSV: $RESULTS_CSV"
echo "Eval outputs: $EVAL_BASE_DIR/"
"""
Standalone test for NVFP4 and NVFP4A16 quantization schemes.
NVFP4 = W4A4 (weights + activations quantized to FP4, requires calibration data)
NVFP4A16 = W4A16 (weights FP4, activations FP16, data-free)
Both require Blackwell GPUs (compute capability 10.0+).
Usage:
python testing/test_nvfp4.py --scheme NVFP4
python testing/test_nvfp4.py --scheme NVFP4A16
python testing/test_nvfp4.py --scheme all
"""
import argparse
import shutil
import time
import torch
from compressed_tensors.offload import dispatch_model
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512
IGNORE = ["lm_head"]
def load_model():
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    return model, tokenizer


def load_calibration_data(tokenizer):
    ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
    ds = ds.shuffle(seed=42)

    def preprocess(example):
        return {
            "text": tokenizer.apply_chat_template(
                example["messages"], tokenize=False
            )
        }

    ds = ds.map(preprocess)

    def tokenize(sample):
        return tokenizer(
            sample["text"],
            padding=False,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
            add_special_tokens=False,
        )

    ds = ds.map(tokenize, remove_columns=ds.column_names)
    return ds


def sample_generate(model, tokenizer):
    dispatch_model(model)
    input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
        model.device
    )
    output = model.generate(input_ids, max_new_tokens=100)
    print(tokenizer.decode(output[0]))
def run_nvfp4(args):
    print("=" * 60)
    print("NVFP4 (W4A4) — weights + activations quantized to FP4")
    print("=" * 60)
    model, tokenizer = load_model()
    ds = load_calibration_data(tokenizer)
    recipe = QuantizationModifier(
        targets="Linear", scheme="NVFP4", ignore=IGNORE
    )

    torch.cuda.reset_peak_memory_stats()
    start = time.time()
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    )
    elapsed = time.time() - start
    peak_gb = torch.cuda.max_memory_allocated() / (1024**3)
    print(f"Time: {elapsed / 60:.2f} min | Peak GPU: {peak_gb:.2f} GB")

    print("\n--- Sample Generation ---")
    sample_generate(model, tokenizer)

    save_dir = args.save_dir or "nvfp4_test_model"
    model.save_pretrained(save_dir, save_compressed=True)
    tokenizer.save_pretrained(save_dir)
    print(f"Saved to {save_dir}")
    if args.cleanup:
        shutil.rmtree(save_dir)
        print(f"Cleaned up {save_dir}")
    del model
    torch.cuda.empty_cache()


def run_nvfp4a16(args):
    print("=" * 60)
    print("NVFP4A16 (W4A16) — weights FP4, activations FP16 (data-free)")
    print("=" * 60)
    model, tokenizer = load_model()
    recipe = QuantizationModifier(
        targets="Linear", scheme="NVFP4A16", ignore=IGNORE
    )

    torch.cuda.reset_peak_memory_stats()
    start = time.time()
    oneshot(model=model, recipe=recipe)
    elapsed = time.time() - start
    peak_gb = torch.cuda.max_memory_allocated() / (1024**3)
    print(f"Time: {elapsed / 60:.2f} min | Peak GPU: {peak_gb:.2f} GB")

    print("\n--- Sample Generation ---")
    sample_generate(model, tokenizer)

    save_dir = args.save_dir or "nvfp4a16_test_model"
    model.save_pretrained(save_dir, save_compressed=True)
    tokenizer.save_pretrained(save_dir)
    print(f"Saved to {save_dir}")
    if args.cleanup:
        shutil.rmtree(save_dir)
        print(f"Cleaned up {save_dir}")
    del model
    torch.cuda.empty_cache()
def main():
    parser = argparse.ArgumentParser(description="Test NVFP4 / NVFP4A16 quantization")
    parser.add_argument(
        "--scheme",
        required=True,
        choices=["NVFP4", "NVFP4A16", "all"],
        help="Which scheme to test",
    )
    parser.add_argument("--save-dir", default=None, help="Override save directory")
    parser.add_argument(
        "--cleanup",
        action="store_true",
        help="Delete saved model after test (saves disk space)",
    )
    args = parser.parse_args()

    if args.scheme in ("NVFP4", "all"):
        run_nvfp4(args)
    if args.scheme in ("NVFP4A16", "all"):
        run_nvfp4a16(args)
    print("\nAll requested NVFP4 tests passed.")


if __name__ == "__main__":
    main()
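
To sanity-check a saved checkpoint beyond the built-in sample generation, the lm_eval pattern from run_all_tests.sh above can be reused. A sketch, assuming the default nvfp4_test_model save directory and the hf backend:

lm_eval \
    --model hf \
    --model_args "pretrained=nvfp4_test_model,dtype=auto,add_bos_token=True" \
    --tasks gsm8k_platinum \
    --num_fewshot 5 \
    --batch_size auto \
    --apply_chat_template --fewshot_as_multiturn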