Created
April 30, 2026 14:58
-
-
Save HDCharles/9c819b08b3db79cfd5116e6e523f61d9 to your computer and use it in GitHub Desktop.
NVFP4 regression test suite: quantize.py and run_all_tests.sh
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import argparse | |
| import os | |
| import time | |
| import torch | |
| from compressed_tensors.offload import dispatch_model | |
| from compressed_tensors.quantization import preset_name_to_scheme | |
| from datasets import load_dataset | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
# Monkey-patch os.chmod: chmod failures on the shared HF cache must not abort
# a run, so swap in a wrapper that swallows PermissionError only.
_original_chmod = os.chmod


def _chmod_ignore_errors(path, mode):
    """os.chmod replacement that ignores PermissionError on shared cache files."""
    try:
        _original_chmod(path, mode)
    except PermissionError:
        # Cache files may be owned by another user; best-effort only.
        pass


os.chmod = _chmod_ignore_errors
| from llmcompressor import oneshot | |
| from llmcompressor.modifiers.gptq import GPTQModifier | |
| from llmcompressor.modifiers.quantization import QuantizationModifier | |
| from llmcompressor.modifiers.transform.awq import AWQModifier | |
| from llmcompressor.modifiers.transform.imatrix import IMatrixGatherer | |
| from llmcompressor.modifiers.transform.smoothquant import SmoothQuantModifier | |
# Per-model quantization settings: which modules to leave unquantized and
# whether the model is a mixture-of-experts (MoE) architecture.
MODEL_CONFIGS = {
    "meta-llama/Meta-Llama-3-8B-Instruct": {
        "ignore": ["lm_head"],
        "is_moe": False,
    },
    "Qwen/Qwen3-30B-A3B": {
        # MoE router/gate layers are excluded from quantization via regexes.
        "ignore": ["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"],
        "is_moe": True,
    },
}
# Calibration dataset shared by every technique.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
def build_recipe(technique, scheme, ignore, is_moe):
    """Build the llm-compressor recipe for one technique/scheme combination.

    Args:
        technique: one of "awq_rtn", "rtn", "rtn_mse", "gptq", "imatrix".
        scheme: quantization preset name (e.g. "NVFP4").
        ignore: module names/regexes to leave unquantized.
        is_moe: whether the target model is a mixture-of-experts.

    Returns:
        A list of modifier objects to pass to ``oneshot``.

    Raises:
        ValueError: if ``technique`` is not recognized.
    """
    if technique == "awq_rtn":
        # duo_scaling is disabled for MoE models.
        duo = False if is_moe else "both"
        quant = QuantizationModifier(ignore=ignore, scheme=scheme, targets=["Linear"])
        return [AWQModifier(duo_scaling=duo), quant]

    if technique == "rtn":
        return [QuantizationModifier(ignore=ignore, scheme=scheme, targets=["Linear"])]

    if technique == "rtn_mse":
        preset = preset_name_to_scheme(scheme, ["Linear"])
        preset.weights.observer = "memoryless_mse"
        return [QuantizationModifier(config_groups={"group_0": preset}, ignore=ignore)]

    if technique == "gptq":
        modifiers = []
        # W8A8 schemes benefit from smoothing activations before GPTQ.
        if "W8A8" in scheme:
            modifiers.append(SmoothQuantModifier(smoothing_strength=0.8))
        modifiers.append(GPTQModifier(ignore=ignore, scheme=scheme, targets=["Linear"]))
        return modifiers

    if technique == "imatrix":
        preset = preset_name_to_scheme(scheme, ["Linear"])
        preset.weights.observer = "imatrix_mse"
        return [
            IMatrixGatherer(ignore=ignore),
            QuantizationModifier(config_groups={"group_0": preset}, ignore=ignore),
        ]

    raise ValueError(f"Unknown technique: {technique}")
def main():
    """CLI entry point: load a model, calibrate, quantize, report, and save."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", required=True)
    parser.add_argument(
        "--technique", required=True,
        choices=["awq_rtn", "rtn", "rtn_mse", "gptq", "imatrix"],
    )
    parser.add_argument("--scheme", required=True)
    parser.add_argument("--save-dir", required=True)
    parser.add_argument("--num-samples", type=int, default=256)
    parser.add_argument("--max-seq-length", type=int, default=512)
    args = parser.parse_args()

    config = MODEL_CONFIGS.get(args.model)
    if config is None:
        raise ValueError(
            f"Unknown model: {args.model}. "
            f"Known models: {list(MODEL_CONFIGS.keys())}"
        )

    # Load weights in their native dtype; the tokenizer may ship custom code.
    model = AutoModelForCausalLM.from_pretrained(args.model, dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)

    # Calibration data: a fixed-seed shuffled slice of ultrachat,
    # chat-templated into plain text and then tokenized.
    calib = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{args.num_samples}]")
    calib = calib.shuffle(seed=42)

    def render_chat(example):
        # Flatten the chat messages into one prompt string.
        return {
            "text": tokenizer.apply_chat_template(
                example["messages"],
                tokenize=False,
            )
        }

    calib = calib.map(render_chat)

    def encode(sample):
        return tokenizer(
            sample["text"],
            padding=False,
            max_length=args.max_seq_length,
            truncation=True,
            add_special_tokens=False,
        )

    calib = calib.map(encode, remove_columns=calib.column_names)

    recipe = build_recipe(
        args.technique, args.scheme, config["ignore"], config["is_moe"]
    )

    # Time the oneshot run and track peak GPU allocation for the report.
    torch.cuda.reset_peak_memory_stats()
    t0 = time.time()
    oneshot(
        model=model,
        dataset=calib,
        recipe=recipe,
        max_seq_length=args.max_seq_length,
        num_calibration_samples=args.num_samples,
    )
    elapsed_time = time.time() - t0
    peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3)

    print("Quantization Complete")
    print(f"Technique: {args.technique}, Scheme: {args.scheme}")
    print(f"Time: {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")
    print(f"Peak GPU Memory: {peak_memory_gb:.2f} GB")

    model.save_pretrained(args.save_dir, save_compressed=True)
    tokenizer.save_pretrained(args.save_dir)
    print(f"Model saved to {args.save_dir}")


if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Observer Refactoring Regression Test Suite
# Tests all combinations of models x techniques x schemes x branches.
# For each combo, checks out main and the PR branch, reinstalls, quantizes, and evals.
#
# Usage:
#   ./run_all_tests.sh 2>&1 | tee regression_results.log
#   python extract_log_summary.py regression_results.log
set -o pipefail
# Use shared hub_cache directory on /raid/engine
export HF_DATASETS_CACHE="/raid/engine/hub_cache"
mkdir -p "$HF_DATASETS_CACHE"
# ── Configuration ────────────────────────────────────────────────────────────
# Repo root: parent of the directory containing this script.
REPO_DIR="$(cd "$(dirname "$0")/.." && pwd)"
MODELS=("meta-llama/Meta-Llama-3-8B-Instruct" "Qwen/Qwen3-30B-A3B")
MODEL_SHORT_NAMES=("Meta-Llama-3-8B-Instruct" "Qwen3-30B-A3B")
# Per-model eval/quant settings: max_model_len,tensor_parallel_size,num_gpus_quant
MODEL_VLLM_ARGS=("2048,1,1" "2048,2,1")
TECHNIQUES=("awq_rtn" "rtn" "rtn_mse" "gptq" "imatrix")
SCHEMES=("NVFP4")
# First branch is the baseline; the second is the PR under test.
BRANCHES=("main" "90_refactor_obs")
# Parallel arrays: display name, lm_eval task id, fewshot count, backend.
EVAL_NAMES=("gsm8k_platinum" "wikitext" "mmlu")
EVAL_LM_TASKS=("gsm8k_platinum" "wikitext" "mmlu")
EVAL_FEWSHOT=("5" "0" "5")
EVAL_BACKENDS=("vllm" "vllm" "vllm")
EVAL_BASE_DIR="./eval_results"
MODEL_BASE_DIR="./regression_models"
RESULTS_CSV="regression_results.csv"
# Remember the branch we started on so it can be restored at the end.
ORIGINAL_BRANCH=$(git -C "$REPO_DIR" rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown")
mkdir -p "$EVAL_BASE_DIR" "$MODEL_BASE_DIR"
| # ── Helper: activate environments ──────────────────────────────────────────── | |
# Activate the virtualenv used for quantization (llm-compressor install).
activate_quant_env() {
    source /home/HDCharles/rhdev/bin/activate
}
# Activate the virtualenv used for evaluation (vLLM + lm_eval install).
activate_eval_env() {
    source /home/HDCharles/vllm/bin/activate
}
| # ── Helper: checkout branch and reinstall ──────────────────────────────────── | |
# ── Helper: checkout branch and reinstall ────────────────────────────────────
# Check out $1 in $REPO_DIR and reinstall llm-compressor from it.
# Returns 1 (without installing) when the checkout fails.
switch_branch() {
    local branch=$1
    echo " Switching to branch: $branch"
    if ! git -C "$REPO_DIR" checkout "$branch" 2>&1; then
        echo " ERROR: git checkout $branch failed"
        return 1
    fi
    activate_quant_env
    # Only the last pip line is interesting in the log.
    pip install -e "$REPO_DIR" 2>&1 | tail -1
    echo " Installed llm-compressor from branch $branch"
}
| # ── Helper: run vLLM evaluation with fallback chain ────────────────────────── | |
# ── Helper: run vLLM evaluation with fallback chain ──────────────────────────
# Global out-param: label of the backend that finally succeeded, or "FAILED".
EVAL_BACKEND=""

# One lm_eval invocation. Relies on bash dynamic scoping to read $task,
# $num_fewshot, $chat_args and $eval_output_dir from the calling function.
#   $1 = backend label recorded in EVAL_BACKEND on success
#   $2 = lm_eval --model backend ("vllm" or "hf")
#   $3 = --model_args string
_lm_eval_attempt() {
    local label=$1 backend=$2 model_args=$3
    # $chat_args is intentionally unquoted: it expands to one or two flags.
    lm_eval \
        --model "$backend" \
        --model_args "$model_args" \
        --tasks "$task" \
        --num_fewshot "$num_fewshot" \
        --batch_size auto \
        $chat_args \
        --output_path "$eval_output_dir" 2>&1
    if [ $? -ne 0 ]; then
        return 1
    fi
    EVAL_BACKEND="$label"
    return 0
}

# Evaluate the model at $1 on one task, trying progressively more conservative
# vLLM configurations (TP>1 -> expert parallel -> TP=1 -> eager) and finally
# the HF backend. Returns 0 on the first success, 1 if everything fails.
#   $1=save_dir  $2=task  $3=num_fewshot  $4=max_model_len  $5=tp_size  $6=eval_output_dir
run_vllm_eval() {
    local save_dir=$1
    local task=$2
    local num_fewshot=$3
    local max_model_len=$4
    local tp_size=$5
    local eval_output_dir=$6
    mkdir -p "$eval_output_dir"
    EVAL_BACKEND="FAILED"
    activate_eval_env
    echo " EVAL: $task (fewshot=$num_fewshot, tp=$tp_size, max_len=$max_model_len)"
    local chat_args="--apply_chat_template"
    if [ "$num_fewshot" -gt 0 ]; then
        chat_args="$chat_args --fewshot_as_multiturn"
    fi
    # Common model_args prefix shared by every vLLM attempt.
    local base="pretrained=$save_dir,dtype=auto,max_model_len=$max_model_len,add_bos_token=True"
    if [ "$tp_size" -gt 1 ]; then
        _lm_eval_attempt "vllm_tp${tp_size}" vllm \
            "$base,tensor_parallel_size=$tp_size,gpu_memory_utilization=0.85" && return 0
        echo " TP=$tp_size failed, trying expert_parallel..."
        _lm_eval_attempt "vllm_expert_parallel" vllm \
            "$base,enable_expert_parallel=True,gpu_memory_utilization=0.85" && return 0
    fi
    echo " Trying TP=1..."
    _lm_eval_attempt "vllm_tp1" vllm \
        "$base,gpu_memory_utilization=0.85" && return 0
    echo " Trying enforce_eager..."
    _lm_eval_attempt "vllm_eager" vllm \
        "$base,enforce_eager=True,gpu_memory_utilization=0.85" && return 0
    echo " Trying hf backend as last resort..."
    # NOTE: the HF backend takes neither max_model_len nor gpu_memory_utilization.
    _lm_eval_attempt "hf" hf \
        "pretrained=$save_dir,dtype=auto,add_bos_token=True" && return 0
    EVAL_BACKEND="FAILED"
    return 1
}
| # ── Helper: run HF-only evaluation ───────────────────────────────────────── | |
# ── Helper: run HF-only evaluation ───────────────────────────────────────
# Evaluate the model at $1 on one task using only the lm_eval HF backend.
# Sets EVAL_BACKEND to "hf" on success, "FAILED" otherwise.
#   $1=save_dir  $2=task  $3=num_fewshot  $4=eval_output_dir
run_hf_eval() {
    local save_dir=$1 task=$2 num_fewshot=$3 eval_output_dir=$4
    mkdir -p "$eval_output_dir"
    EVAL_BACKEND="FAILED"
    activate_eval_env
    echo " EVAL: $task (fewshot=$num_fewshot, backend=hf)"
    local chat_args="--apply_chat_template"
    if [ "$num_fewshot" -gt 0 ]; then
        chat_args="$chat_args --fewshot_as_multiturn"
    fi
    if lm_eval \
        --model hf \
        --model_args "pretrained=$save_dir,dtype=auto,add_bos_token=True" \
        --tasks "$task" \
        --num_fewshot "$num_fewshot" \
        --batch_size auto \
        $chat_args \
        --output_path "$eval_output_dir" 2>&1; then
        EVAL_BACKEND="hf"
        return 0
    fi
    EVAL_BACKEND="FAILED"
    return 1
}
| # ── Helper: extract metric from lm_eval JSON results ──────────────────────── | |
# ── Helper: extract metric from lm_eval JSON results ─────────────────────────
# Print the headline metric for $2 from the newest results_*.json under $1,
# or "N/A" when no usable result exists.
#   $1=eval_output_dir  $2=task
extract_metric() {
    local eval_output_dir=$1
    local task=$2
    local results_json
    results_json=$(find "$eval_output_dir" -name "results_*.json" -type f 2>/dev/null | sort | tail -1)
    if [ -z "$results_json" ]; then
        echo "N/A"
        return
    fi
    # Pass the path and task via argv instead of interpolating them into the
    # Python source (quotes/special characters in paths would break -c text).
    # Same quoted-heredoc pattern as print_comparison below.
    python3 - "$results_json" "$task" 2>/dev/null <<'PYEOF' || echo "N/A"
import json, sys
path, task = sys.argv[1], sys.argv[2]
with open(path) as f:
    data = json.load(f)
results = data.get('results', {})
# lm_eval may suffix/prefix the task key; match by substring.
task_results = None
for key in results:
    if task in key:
        task_results = results[key]
        break
if task_results is None:
    print('N/A')
    sys.exit()
if 'gsm8k' in task:
    val = task_results.get('exact_match,strict-match')
    if val is not None:
        print(f'{val*100:.2f}%')
    else:
        print('N/A')
elif 'wikitext' in task:
    # Perplexity: lower is better, printed as a plain number.
    val = task_results.get('word_perplexity,none')
    if val is not None:
        print(f'{val:.2f}')
    else:
        print('N/A')
elif 'mmlu' in task:
    val = task_results.get('acc,none')
    if val is not None:
        print(f'{val*100:.2f}%')
    else:
        print('N/A')
else:
    # Unknown task: report the first numeric, non-stderr metric found.
    for k, v in task_results.items():
        if 'stderr' not in k and k != 'alias' and isinstance(v, (int, float)):
            print(f'{v:.4f}')
            sys.exit()
    print('N/A')
PYEOF
}
| # ── Helper: print current results summary ──────────────────────────────────── | |
# ── Helper: print current results summary ────────────────────────────────────
# Render the results CSV (if any) as an aligned table between banner lines.
print_summary() {
    echo ""
    echo "╔══════════════════════════════════════════════════════════════════════════════════════════════════════╗"
    echo "║ RESULTS SUMMARY (so far) ║"
    echo "╚══════════════════════════════════════════════════════════════════════════════════════════════════════╝"
    echo ""
    if [ ! -f "$RESULTS_CSV" ]; then
        echo "(no results yet)"
    else
        # column aligns the comma-separated fields for readability.
        column -t -s',' < "$RESULTS_CSV"
    fi
    echo ""
    echo "════════════════════════════════════════════════════════════════════════════════════════════════════════"
    echo ""
}
| # ── Helper: print branch comparison table ───────────────────────────────── | |
# ── Helper: print branch comparison table ─────────────────────────────────
# Compare main vs. the PR branch for every (model, scheme, technique, task)
# that has results for both, printing a percent change (sign-adjusted so
# that "+" is always an improvement: wikitext perplexity is inverted).
print_comparison() {
    if [ ! -f "$RESULTS_CSV" ]; then
        return
    fi
    python3 - "$RESULTS_CSV" <<'PYEOF'
import csv, sys
csv_path = sys.argv[1]
rows = []
with open(csv_path) as f:
    reader = csv.DictReader(f)
    for r in reader:
        rows.append(r)
if not rows:
    sys.exit()
# Build lookup: (model, scheme, technique, task) -> {branch: metric}
lookup = {}
for r in rows:
    key = (r["model"], r["scheme"], r["technique"], r["task"])
    lookup.setdefault(key, {})
    lookup[key][r["branch"]] = r["metric"]
entries = [(k, v) for k, v in lookup.items()
           if "main" in v and any(b != "main" for b in v)]
if not entries:
    sys.exit()
# Derive the PR branch name from ALL rows, not just the first lookup entry:
# the first entry may only have "main" results yet, which previously made
# pr_branch degenerate to "pr" and blanked every comparison column.
pr_branch = next((r["branch"] for r in rows if r["branch"] != "main"), "pr")
def parse_metric(s):
    """Return (value, is_percent); value is None when unparseable."""
    s = s.strip()
    if s.endswith("%"):
        return float(s[:-1]), True
    try:
        return float(s), False
    except ValueError:
        return None, False
def calc_change(main_str, pr_str, task):
    m_val, _ = parse_metric(main_str)
    p_val, _ = parse_metric(pr_str)
    if m_val is None or p_val is None or m_val == 0:
        return "N/A"
    if "wikitext" in task:
        # Perplexity: lower is better, so invert the sign convention.
        pct = (m_val - p_val) / m_val * 100
    else:
        pct = (p_val - m_val) / m_val * 100
    sign = "+" if pct >= 0 else ""
    return f"{sign}{pct:.2f}%"
print("")
print("╔════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╗")
print(f"║ BRANCH COMPARISON (main vs {pr_branch})")
print("╚════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╝")
print("")
header = (f"{'model':<30} {'scheme':<10} {'technique':<12} {'task':<18} "
          f"{'main':>14} {'PR':>14} {'change':>12}")
print(header)
print("-" * len(header))
for (model, scheme, technique, task), metrics in sorted(entries):
    m = metrics.get("main", "")
    p = metrics.get(pr_branch, "")
    change = calc_change(m, p, task) if m and p else ""
    print(f"{model:<30} {scheme:<10} {technique:<12} {task:<18} "
          f"{m:>14} {p:>14} {change:>12}")
print("")
PYEOF
}
# ── Initialize results CSV ───────────────────────────────────────────────────
# Keep a backup of any previous run's CSV, then start a fresh one with header.
if [ -f "$RESULTS_CSV" ]; then
    cp "$RESULTS_CSV" "${RESULTS_CSV}.bak"
fi
echo "model,scheme,technique,branch,task,metric,status,eval_backend,save_dir" > "$RESULTS_CSV"
# ── Main loop ────────────────────────────────────────────────────────────────
# Iterate the full cross product model x technique x scheme x branch.
# TOTAL/PASSED/FAILED count individual evaluations, not combos.
TOTAL=0
PASSED=0
FAILED=0
for model_idx in "${!MODELS[@]}"; do
    model="${MODELS[$model_idx]}"
    model_short="${MODEL_SHORT_NAMES[$model_idx]}"
    # Unpack "max_model_len,tensor_parallel_size,num_gpus_quant" for this model.
    IFS=',' read -r max_model_len tp_size num_gpus_quant <<< "${MODEL_VLLM_ARGS[$model_idx]}"
    for technique in "${TECHNIQUES[@]}"; do
        for scheme in "${SCHEMES[@]}"; do
            for branch in "${BRANCHES[@]}"; do
                save_dir="$MODEL_BASE_DIR/${model_short}-${scheme}-${technique}-${branch}"
                echo ""
                echo "╔══════════════════════════════════════════════════════════════════════════════════════════╗"
                echo "║ MODEL: $model_short"
                echo "║ SCHEME: $scheme"
                echo "║ TECHNIQUE: $technique"
                echo "║ BRANCH: $branch"
                echo "╚══════════════════════════════════════════════════════════════════════════════════════════╝"
                echo ""
                # ── Skip entirely if all evals already have results ────
                all_evals_cached=true
                for eval_idx in "${!EVAL_NAMES[@]}"; do
                    eval_dir="$EVAL_BASE_DIR/${model_short}-${scheme}-${technique}-${branch}/${EVAL_NAMES[$eval_idx]}"
                    if ! find "$eval_dir" -name "results_*.json" -type f 2>/dev/null | grep -q .; then
                        all_evals_cached=false
                        break
                    fi
                done
                if [ "$all_evals_cached" = true ]; then
                    echo "All evals already cached, skipping quantization and eval."
                    # Re-extract cached metrics so they still land in this run's CSV.
                    for eval_idx in "${!EVAL_NAMES[@]}"; do
                        eval_name="${EVAL_NAMES[$eval_idx]}"
                        lm_task="${EVAL_LM_TASKS[$eval_idx]}"
                        eval_dir="$EVAL_BASE_DIR/${model_short}-${scheme}-${technique}-${branch}/${eval_name}"
                        metric_val=$(extract_metric "$eval_dir" "$lm_task")
                        echo " $eval_name: $metric_val"
                        echo "$model_short,$scheme,$technique,$branch,$eval_name,$metric_val,PASSED,cached,$save_dir" >> "$RESULTS_CSV"
                        PASSED=$((PASSED + 1))
                        TOTAL=$((TOTAL + 1))
                    done
                    print_summary
                    print_comparison
                    continue
                fi
                # ── Switch branch and reinstall ───────────────────────
                switch_branch "$branch"
                if [ $? -ne 0 ]; then
                    # Record one BRANCH_FAILED row per planned eval.
                    echo "BRANCH SWITCH FAILED for $branch"
                    for eval_name in "${EVAL_NAMES[@]}"; do
                        echo "$model_short,$scheme,$technique,$branch,$eval_name,N/A,BRANCH_FAILED,N/A,$save_dir" >> "$RESULTS_CSV"
                    done
                    FAILED=$((FAILED + ${#EVAL_NAMES[@]}))
                    TOTAL=$((TOTAL + ${#EVAL_NAMES[@]}))
                    print_summary
                    continue
                fi
                # ── Quantize (skip if model already exists) ────────────
                if [ -d "$save_dir" ] && [ -f "$save_dir/config.json" ]; then
                    echo "Quantized model already exists at $save_dir, skipping quantization."
                else
                    activate_quant_env
                    echo "============================================"
                    echo "Running: quantize.py (model=$model_short, technique=$technique, scheme=$scheme, branch=$branch)"
                    echo "============================================"
                    # Multi-GPU quantization goes through torchrun; otherwise plain python.
                    if [ "$num_gpus_quant" -gt 1 ]; then
                        torchrun --nproc_per_node="$num_gpus_quant" "$REPO_DIR/testing/quantize.py" \
                            --model "$model" --technique "$technique" --scheme "$scheme" \
                            --save-dir "$save_dir" 2>&1
                    else
                        python "$REPO_DIR/testing/quantize.py" \
                            --model "$model" --technique "$technique" --scheme "$scheme" \
                            --save-dir "$save_dir" 2>&1
                    fi
                    quant_status=$?
                    if [ $quant_status -ne 0 ]; then
                        # Record one QUANT_FAILED row per planned eval.
                        echo "QUANTIZATION FAILED for $model_short / $scheme / $technique / $branch"
                        for eval_name in "${EVAL_NAMES[@]}"; do
                            echo "$model_short,$scheme,$technique,$branch,$eval_name,N/A,QUANT_FAILED,N/A,$save_dir" >> "$RESULTS_CSV"
                        done
                        FAILED=$((FAILED + ${#EVAL_NAMES[@]}))
                        TOTAL=$((TOTAL + ${#EVAL_NAMES[@]}))
                        print_summary
                        print_comparison
                        continue
                    fi
                fi
                # ── Clear GPU memory before eval ─────────────────────────
                python3 -c "import torch; torch.cuda.empty_cache(); [torch.cuda.reset_peak_memory_stats(i) for i in range(torch.cuda.device_count())]" 2>/dev/null
                # ── Evaluate ─────────────────────────────────────────────
                for eval_idx in "${!EVAL_NAMES[@]}"; do
                    eval_name="${EVAL_NAMES[$eval_idx]}"
                    lm_task="${EVAL_LM_TASKS[$eval_idx]}"
                    fewshot="${EVAL_FEWSHOT[$eval_idx]}"
                    backend="${EVAL_BACKENDS[$eval_idx]}"
                    eval_dir="$EVAL_BASE_DIR/${model_short}-${scheme}-${technique}-${branch}/${eval_name}"
                    TOTAL=$((TOTAL + 1))
                    # Skip eval if results already exist
                    existing_result=$(find "$eval_dir" -name "results_*.json" -type f 2>/dev/null | sort | tail -1)
                    if [ -n "$existing_result" ]; then
                        metric_val=$(extract_metric "$eval_dir" "$lm_task")
                        echo " EVAL: $eval_name — skipping, previous result found: $metric_val"
                        echo "$model_short,$scheme,$technique,$branch,$eval_name,$metric_val,PASSED,cached,$save_dir" >> "$RESULTS_CSV"
                        PASSED=$((PASSED + 1))
                        continue
                    fi
                    if [ "$backend" == "hf" ]; then
                        run_hf_eval "$save_dir" "$lm_task" "$fewshot" "$eval_dir"
                    else
                        run_vllm_eval "$save_dir" "$lm_task" "$fewshot" "$max_model_len" "$tp_size" "$eval_dir"
                    fi
                    eval_status=$?
                    if [ $eval_status -eq 0 ]; then
                        metric_val=$(extract_metric "$eval_dir" "$lm_task")
                        echo "PASS EVAL"
                        echo "$model_short,$scheme,$technique,$branch,$eval_name,$metric_val,PASSED,$EVAL_BACKEND,$save_dir" >> "$RESULTS_CSV"
                        PASSED=$((PASSED + 1))
                    else
                        echo "FAIL EVAL"
                        echo "$model_short,$scheme,$technique,$branch,$eval_name,N/A,FAILED,$EVAL_BACKEND,$save_dir" >> "$RESULTS_CSV"
                        FAILED=$((FAILED + 1))
                    fi
                done
                # ── Clean up model to free disk space ────────────────────
                # Eval results persist under $EVAL_BASE_DIR, so the weights
                # can be regenerated if needed and are safe to delete.
                if [ -d "$save_dir" ]; then
                    echo "Removing quantized model at $save_dir to free disk space."
                    rm -rf "$save_dir"
                fi
                print_summary
                print_comparison
            done # branch
        done # scheme
    done # technique
done # model
# ── Restore original branch ──────────────────────────────────────────────────
# Put the repo (and the installed package) back the way we found it.
echo "Restoring original branch: $ORIGINAL_BRANCH"
git -C "$REPO_DIR" checkout "$ORIGINAL_BRANCH" 2>&1
activate_quant_env
pip install -e "$REPO_DIR" 2>&1 | tail -1
# ── Final Summary ────────────────────────────────────────────────────────────
echo ""
echo "╔══════════════════════════════════════════════════════════════════════════════════════════╗"
echo "║ FINAL SUMMARY: $PASSED passed, $FAILED failed out of $TOTAL total evaluations ║"
echo "╚══════════════════════════════════════════════════════════════════════════════════════════╝"
echo ""
print_summary
print_comparison
echo "Results CSV: $RESULTS_CSV"
echo "Eval outputs: $EVAL_BASE_DIR/"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment