Last active
May 5, 2026 17:17
-
-
Save HDCharles/a8ece53d76ce89ce81ddeed4fba3aa28 to your computer and use it in GitHub Desktop.
Updated parallel regression test with chg GPU detection and FP8 support
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/bash | |
| # Parallel Regression Test Script | |
| # 1. Quantizes remaining models (if needed) | |
| # 2. Runs evaluations in parallel (up to MAX_PARALLEL_JOBS at a time, one GPU per job) | |
| # 3. Saves individual logs for each eval job | |
| # 4. Prints summaries as jobs complete | |
# pipefail matters here: switch_branch inspects $? after a `git checkout ... | tail`
# pipeline, which only reflects git's failure when pipefail is enabled.
| set -o pipefail | |
| # ── Configuration ──────────────────────────────────────────────────────────── | |
# REPO_DIR is the parent of the directory that contains this script.
| REPO_DIR="$(cd "$(dirname "$0")/.." && pwd)" | |
| export HF_DATASETS_CACHE="$HOME/hf_hub" | |
| mkdir -p "$HF_DATASETS_CACHE" | |
| MODEL_BASE_DIR="$HOME/hf_hub/regression_models" | |
| EVAL_BASE_DIR="./eval_results" | |
| EVAL_LOGS_DIR="./eval_logs" | |
| RESULTS_CSV="parallel_regression_results.csv" | |
| # Models to test: model id -> "short_name,max_model_len,tp_size" (field order used by the IFS=',' reads below) | |
# NOTE(review): tp_size is parsed and carried through the job records, but no launch
# command in this script passes it to lm_eval; every job runs on a single GPU.
| declare -A MODELS=( | |
| ["Qwen/Qwen2.5-3B-Instruct"]="Qwen2.5-3B-Instruct,2048,1" | |
| ["meta-llama/Meta-Llama-3-8B-Instruct"]="Meta-Llama-3-8B-Instruct,2048,1" | |
| ["Qwen/Qwen3-30B-A3B"]="Qwen3-30B-A3B,2048,2" | |
| ) | |
# The job matrix is MODELS x SCHEMES x TECHNIQUES x BRANCHES (x eval tasks).
| TECHNIQUES=("awq_rtn" "rtn" "rtn_mse" "gptq" "imatrix") | |
| BRANCHES=("main" "90_refactor_obs") | |
| SCHEMES=("NVFP4" "FP8") | |
# Parallel arrays, indexed together: EVAL_TASKS[i] names the result directory,
# EVAL_LM_TASKS[i] is the value given to lm_eval --tasks, EVAL_FEWSHOT[i] to --num_fewshot.
| EVAL_TASKS=("wikitext" "mmlu") | |
| EVAL_LM_TASKS=("wikitext" "mmlu") | |
| EVAL_FEWSHOT=("0" "5") | |
| # Parallel config (both quantization and evaluation) | |
| MAX_PARALLEL_JOBS=4 | |
| mkdir -p "$EVAL_BASE_DIR" "$MODEL_BASE_DIR" "$EVAL_LOGS_DIR" | |
# Print, one per line, the IDs of GPUs that `chg status` reports as IN_USE
# for the invoking user.
#
# The status table is expected to use '│' as the column separator with the
# GPU id in column 1, the state in column 2 and the owner in column 3; the
# first two lines (table header) are skipped. ANSI colour sequences are
# removed before parsing. The owner test is a regex match, not equality.
# Outputs: GPU ids on stdout (a single empty line when nothing matches).
detect_reserved_gpus() {
  local me
  me=$(whoami)

  local gpu_ids
  gpu_ids=$(
    chg status 2>/dev/null \
      | sed 's/\x1b\[[0-9;]*m//g' \
      | awk -F '│' -v user="$me" '
          NR <= 2 { next }
          $2 ~ /IN_USE/ && $3 ~ user {
            id = $1
            gsub(/^[ \t]+|[ \t]+$/, "", id)
            print id
          }'
  )

  echo "$gpu_ids"
}
# Build the pool of usable GPUs from the current user's chg reservations and
# abort when nothing is reserved.
echo "Detecting reserved GPUs via chg status..."
# Word splitting is intentional: one array element per whitespace-separated id.
# shellcheck disable=SC2207
AVAILABLE_GPUS=($(detect_reserved_gpus))
if (( ${#AVAILABLE_GPUS[@]} == 0 )); then
  echo "ERROR: No GPUs reserved. Please reserve GPUs using 'chg reserve <gpu_ids>' first."
  echo "Example: chg reserve 0,1,2,3"
  exit 1
fi
echo "Reserved GPUs detected: ${AVAILABLE_GPUS[*]}"
echo "Will use up to $MAX_PARALLEL_JOBS parallel jobs"
echo ""

# GPU allocation tracking: GPU id -> 1 while a job launched by this script
# holds it, 0 otherwise.
declare -A GPU_IN_USE
for gpu in "${AVAILABLE_GPUS[@]}"; do
  GPU_IN_USE[$gpu]=0
done
# Helper: claim the next free GPU from the reserved pool.
# Globals:  AVAILABLE_GPUS (read), GPU_IN_USE (read/written),
#           ALLOCATED_GPU (written: claimed GPU id, or "" when none is free)
# Outputs:  [DEBUG] tracking lines on stdout
# Returns:  0 when a GPU was claimed, 1 when every GPU is already allocated
get_free_gpu() {
  ALLOCATED_GPU=""

  # Loop variable is local so the caller's own global $gpu is not overwritten.
  local gpu
  local internal_status=""

  # Show internal tracking status
  for gpu in "${AVAILABLE_GPUS[@]}"; do
    internal_status+="GPU$gpu:${GPU_IN_USE[$gpu]} "
  done
  echo "[DEBUG] Reserved GPU tracking: $internal_status"

  # Find a GPU that's not currently allocated by us
  for gpu in "${AVAILABLE_GPUS[@]}"; do
    if [ "${GPU_IN_USE[$gpu]}" -eq 0 ]; then
      GPU_IN_USE[$gpu]=1
      ALLOCATED_GPU=$gpu
      echo "[DEBUG] Allocated GPU $gpu"
      return 0
    fi
  done

  # No GPU available
  echo "[DEBUG] No free GPUs available (all currently allocated by script)"
  return 1
}
# Helper: mark a GPU as free again in the GPU_IN_USE tracking table.
# Arguments: $1 - GPU id previously handed out by get_free_gpu
release_gpu() {
  GPU_IN_USE["$1"]=0
}
# ── Helper: activate environments ────────────────────────────────────────────
# Load the virtualenv used for quantization into the current shell.
activate_quant_env() {
  . /home/HDCharles/rhdev/bin/activate
}
# Load the virtualenv used for evaluation (lm_eval) into the current shell.
activate_eval_env() {
  . /home/HDCharles/vllm/bin/activate
}
# ── Helper: checkout branch and reinstall ────────────────────────────────────
# Check out a branch of REPO_DIR and reinstall the package from it (editable)
# inside the quantization environment.
# Globals:   REPO_DIR (read)
# Arguments: $1 - branch name
# Outputs:   progress / error lines plus the tail of git and pip output
# Returns:   0 on success, 1 when either the checkout or the install fails
switch_branch() {
  local branch=$1
  echo " Switching to branch: $branch"

  # PIPESTATUS[0] is git's own status, independent of `tail` and of whether
  # the caller enabled pipefail.
  git -C "$REPO_DIR" checkout "$branch" 2>&1 | tail -5
  if [ "${PIPESTATUS[0]}" -ne 0 ]; then
    echo " ERROR: git checkout $branch failed"
    return 1
  fi

  activate_quant_env

  # A failed install would leave the previous branch's code importable, so it
  # must be reported instead of being announced as installed.
  pip install -e "$REPO_DIR" 2>&1 | tail -1
  if [ "${PIPESTATUS[0]}" -ne 0 ]; then
    echo " ERROR: pip install -e $REPO_DIR failed on branch $branch"
    return 1
  fi

  echo " Installed llm-compressor from branch $branch"
}
# ── Helper: quantize a model ─────────────────────────────────────────────────
# Quantize one model in the foreground unless its output already exists.
# Globals:   REPO_DIR (read)
# Arguments: $1 model id, $2 short name, $3 technique, $4 scheme,
#            $5 branch (used in messages only), $6 output directory
# Returns:   0 when the model exists or was produced, 1 when quantize.py fails
quantize_model() {
  local model=$1 model_short=$2 technique=$3
  local scheme=$4 branch=$5 save_dir=$6

  printf '%s\n' \
    "" \
    "╔══════════════════════════════════════════════════════════════════════════════╗" \
    "║ QUANTIZING: $model_short / $technique / $scheme / $branch" \
    "╚══════════════════════════════════════════════════════════════════════════════╝" \
    ""

  # A directory containing config.json counts as an already-quantized model.
  if [[ -d "$save_dir" && -f "$save_dir/config.json" ]]; then
    echo "Quantized model already exists at $save_dir, skipping."
    return 0
  fi

  activate_quant_env
  if ! python "$REPO_DIR/testing/quantize.py" \
      --model "$model" \
      --technique "$technique" \
      --scheme "$scheme" \
      --save-dir "$save_dir" 2>&1; then
    echo "QUANTIZATION FAILED for $model_short / $scheme / $technique / $branch"
    return 1
  fi

  echo "Model saved to $save_dir"
  return 0
}
# ── Helper: run single evaluation (called in background) ────────────────────
# Run one lm_eval job and write everything it prints to a log file.
#
# Only the NVFP4 scheme has a backend here (HF); any other scheme is logged
# and reported as FAILED without running lm_eval.
#
# The work happens in a subshell, so the function can be called in the
# foreground as well as with `&`: environment activation does not leak into
# the caller and the status is returned instead of exiting the caller's shell.
#
# Arguments: $1 short name, $2 scheme, $3 technique, $4 branch, $5 task name,
#            $6 lm_eval task, $7 num_fewshot, $8 model dir, $9 output dir,
#            $10 log file
# Returns:   0 when lm_eval succeeded, 1 otherwise
run_single_eval() {
  local model_short=$1
  local scheme=$2
  local technique=$3
  local branch=$4
  local task_name=$5
  local lm_task=$6
  local fewshot=$7
  local save_dir=$8
  local eval_dir=$9
  local log_file="${10}"

  (
    echo "════════════════════════════════════════════════════════════════"
    echo "EVAL START: $model_short / $technique / $branch / $task_name"
    echo "Task: $lm_task, Fewshot: $fewshot"
    echo "════════════════════════════════════════════════════════════════"
    echo ""

    mkdir -p "$eval_dir"
    activate_eval_env

    result="FAILED"
    backend="FAILED"

    # Try HF backend for NVFP4
    if [ "$scheme" == "NVFP4" ]; then
      echo "Using HF backend for NVFP4..."
      # Array keeps each flag a separate word without relying on word splitting.
      chat_args=(--apply_chat_template)
      if [ "$fewshot" -gt 0 ]; then
        chat_args+=(--fewshot_as_multiturn)
      fi
      if lm_eval \
          --model hf \
          --model_args "pretrained=$save_dir,dtype=auto,add_bos_token=True" \
          --tasks "$lm_task" \
          --num_fewshot "$fewshot" \
          --batch_size auto \
          "${chat_args[@]}" \
          --output_path "$eval_dir" 2>&1; then
        result="PASSED"
        backend="hf"
      fi
    fi

    echo ""
    echo "════════════════════════════════════════════════════════════════"
    echo "EVAL COMPLETE: $result"
    echo "Backend: $backend"
    echo "════════════════════════════════════════════════════════════════"

    # Subshell exit status becomes the function's return status.
    if [ "$result" == "PASSED" ]; then
      exit 0
    fi
    exit 1
  ) &> "$log_file"
  return $?
}
# ── Helper: extract metric from eval results ────────────────────────────────
# Print the headline metric for a task from the newest results_*.json under a
# directory (newest = last path in sort order).
#
# The path and task are passed to Python as arguments rather than spliced into
# the program text, so quotes or backslashes in them cannot break the parser.
# An entry whose key equals the task is preferred; only if there is none does
# the first key containing the task name get used.
#
# Arguments: $1 - eval output directory, $2 - lm_eval task name
# Outputs:   formatted metric, or "N/A" when no file/entry/metric is found
extract_metric() {
  local eval_output_dir=$1
  local task=$2
  local results_json
  results_json=$(find "$eval_output_dir" -name "results_*.json" -type f 2>/dev/null | sort | tail -1)
  if [ -z "$results_json" ]; then
    echo "N/A"
    return
  fi
  python3 - "$results_json" "$task" 2>/dev/null <<'PYEOF' || echo "N/A"
import json
import sys

results_path, task = sys.argv[1], sys.argv[2]
with open(results_path) as f:
    results = json.load(f).get('results', {})

# Exact key first, so 'mmlu' is not shadowed by an 'mmlu_<subtask>' entry.
task_results = results.get(task)
if task_results is None:
    task_results = next((v for k, v in results.items() if task in k), None)
if task_results is None:
    print('N/A')
    sys.exit()


def show(key, fmt):
    val = task_results.get(key)
    print(fmt(val) if val is not None else 'N/A')


if 'gsm8k' in task:
    show('exact_match,strict-match', lambda v: f'{v*100:.2f}%')
elif 'wikitext' in task:
    show('word_perplexity,none', lambda v: f'{v:.2f}')
elif 'mmlu' in task:
    show('acc,none', lambda v: f'{v*100:.2f}%')
else:
    # Unknown task: report the first numeric value that is not a stderr.
    for k, v in task_results.items():
        if 'stderr' not in k and k != 'alias' and isinstance(v, (int, float)):
            print(f'{v:.4f}')
            sys.exit()
    print('N/A')
PYEOF
}
# ── Helper: print comparison summary ─────────────────────────────────────────
# Print a main-vs-PR table built from RESULTS_CSV.
#
# Only PASSED/CACHED rows are used, and only (model, scheme, technique, task)
# combinations that have a "main" result plus at least one other branch.
# The PR branch is taken from those comparable combinations, so a combination
# that only has a "main" result cannot hide the PR column.
# For wikitext the change is (main - pr) / main, otherwise (pr - main) / main.
#
# Globals: RESULTS_CSV (read)
# Outputs: the table on stdout; nothing when the CSV is missing or holds no
#          comparable rows
print_comparison() {
  if [ ! -f "$RESULTS_CSV" ]; then
    return
  fi
  python3 - "$RESULTS_CSV" <<'PYEOF'
import csv, sys

csv_path = sys.argv[1]
rows = []
with open(csv_path) as f:
    reader = csv.DictReader(f)
    for r in reader:
        if r.get('status') in ['PASSED', 'CACHED']:
            rows.append(r)
if not rows:
    sys.exit()

# Build lookup: (model, scheme, technique, task) -> {branch: metric}
lookup = {}
for r in rows:
    key = (r["model"], r["scheme"], r["technique"], r["task"])
    lookup.setdefault(key, {})
    lookup[key][r["branch"]] = r["metric"]

entries = sorted(
    ((k, v) for k, v in lookup.items()
     if "main" in v and any(b != "main" for b in v)),
    key=lambda kv: kv[0],
)
if not entries:
    sys.exit()

# Every entry has a non-main branch, so this always finds one.
pr_branch = next(b for _, branches in entries for b in branches if b != "main")


def parse_metric(s):
    s = s.strip()
    if s.endswith("%"):
        try:
            return float(s[:-1]), True
        except ValueError:
            return None, False
    try:
        return float(s), False
    except ValueError:
        return None, False


def calc_change(main_str, pr_str, task):
    m_val, _ = parse_metric(main_str)
    p_val, _ = parse_metric(pr_str)
    if m_val is None or p_val is None or m_val == 0:
        return "N/A"
    if "wikitext" in task:
        pct = (m_val - p_val) / m_val * 100
    else:
        pct = (p_val - m_val) / m_val * 100
    sign = "+" if pct >= 0 else ""
    return f"{sign}{pct:.2f}%"


print("")
print("╔════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╗")
print(f"║ BRANCH COMPARISON (main vs {pr_branch})")
print("╚════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╝")
print("")
header = (f"{'model':<30} {'scheme':<10} {'technique':<12} {'task':<18} "
          f"{'main':>14} {'PR':>14} {'change':>12}")
print(header)
print("-" * len(header))
for (model, scheme, technique, task), metrics in entries:
    m = metrics.get("main", "")
    p = metrics.get(pr_branch, "")
    change = calc_change(m, p, task) if m and p else ""
    print(f"{model:<30} {scheme:<10} {technique:<12} {task:<18} "
          f"{m:>14} {p:>14} {change:>12}")
print("")
PYEOF
}
# ── Helper: run quantization in background ──────────────────────────────────
# Run testing/quantize.py for one model and write all output to a log file.
#
# The work happens in a subshell, so the function is safe to call in the
# foreground as well as with `&`: the status is returned to the caller instead
# of terminating the caller's shell.
#
# Globals:   REPO_DIR (read)
# Arguments: $1 model id, $2 short name, $3 technique, $4 scheme,
#            $5 branch (used in the log banner only), $6 output dir, $7 log file
# Returns:   0 when quantize.py succeeded, 1 otherwise
run_quantize_job() {
  local model=$1
  local model_short=$2
  local technique=$3
  local scheme=$4
  local branch=$5
  local save_dir=$6
  local log_file=$7

  (
    echo "════════════════════════════════════════════════════════════════"
    echo "QUANT START: $model_short / $technique / $branch"
    echo "════════════════════════════════════════════════════════════════"
    echo ""

    activate_quant_env

    if python "$REPO_DIR/testing/quantize.py" \
        --model "$model" \
        --technique "$technique" \
        --scheme "$scheme" \
        --save-dir "$save_dir" 2>&1; then
      echo ""
      echo "════════════════════════════════════════════════════════════════"
      echo "QUANT COMPLETE: SUCCESS"
      echo "Model saved to $save_dir"
      echo "════════════════════════════════════════════════════════════════"
      exit 0
    fi

    echo ""
    echo "════════════════════════════════════════════════════════════════"
    echo "QUANT COMPLETE: FAILED"
    echo "════════════════════════════════════════════════════════════════"
    exit 1
  ) &> "$log_file"
  return $?
}
# ── Step 1: Parallel Quantization ───────────────────────────────────────────
printf '%s\n' \
  "" \
  "╔══════════════════════════════════════════════════════════════════════════════╗" \
  "║ STEP 1: PARALLEL QUANTIZATION (${MAX_PARALLEL_JOBS} jobs at a time) ║" \
  "╚══════════════════════════════════════════════════════════════════════════════╝" \
  ""

# Collect one job record per model/scheme/technique/branch combination whose
# output directory does not yet contain config.json.
# Record layout: model_key|model_short|scheme|technique|branch|save_dir
declare -a QUANT_JOBS
TOTAL_SKIPPED=0
for model_key in "${!MODELS[@]}"; do
  IFS=',' read -r model_short max_len tp_size <<< "${MODELS[$model_key]}"
  for scheme in "${SCHEMES[@]}"; do
    for technique in "${TECHNIQUES[@]}"; do
      for branch in "${BRANCHES[@]}"; do
        save_dir="$MODEL_BASE_DIR/${model_short}-${scheme}-${technique}-${branch}"
        if [[ -d "$save_dir" && -f "$save_dir/config.json" ]]; then
          TOTAL_SKIPPED=$((TOTAL_SKIPPED + 1))
          echo " SKIP: $model_short / $scheme / $technique / $branch (already exists)"
        else
          QUANT_JOBS+=("$model_key|$model_short|$scheme|$technique|$branch|$save_dir")
        fi
      done
    done
  done
done

# Put every "main" job ahead of the jobs for other branches so that jobs on
# the same branch run back to back.
if (( ${#QUANT_JOBS[@]} > 0 )); then
  declare -a MAIN_JOBS
  declare -a OTHER_JOBS
  for job_info in "${QUANT_JOBS[@]}"; do
    IFS='|' read -r _ _ _ _ branch _ <<< "$job_info"
    case "$branch" in
      main) MAIN_JOBS+=("$job_info") ;;
      *) OTHER_JOBS+=("$job_info") ;;
    esac
  done
  QUANT_JOBS=("${MAIN_JOBS[@]}" "${OTHER_JOBS[@]}")
fi

printf '%s\n' \
  "" \
  "Already quantized: $TOTAL_SKIPPED models" \
  "Quantization jobs to run: ${#QUANT_JOBS[@]}" \
  ""

if (( ${#QUANT_JOBS[@]} > 0 )); then
  echo "Models to be quantized:"
  echo "────────────────────────────────────────────────────────────────"
  for job_info in "${QUANT_JOBS[@]}"; do
    IFS='|' read -r _ model_short scheme technique branch _ <<< "$job_info"
    echo " • $model_short / $scheme / $technique / $branch"
  done
  echo "────────────────────────────────────────────────────────────────"
  echo ""
fi
# Run quantization jobs in parallel, one GPU per job.
#
# All jobs running at the same time share one checkout of REPO_DIR, so a job
# for a different branch is only started once every running job has finished
# and switch_branch succeeded. If the switch fails, the job is counted as
# failed instead of being run against the wrong code.
#
# Parallel arrays describing the running jobs (same index = same job):
declare -a QUANT_PIDS
declare -a QUANT_LOGS
declare -a QUANT_INFO
declare -a QUANT_GPUS
quant_idx=0
quant_completed=0
quant_failed=0
installed_branch="" # branch last installed successfully ("" = unknown)
failed_branch=""    # branch whose most recent switch attempt failed

while [ "$quant_idx" -lt "${#QUANT_JOBS[@]}" ] || [ "${#QUANT_PIDS[@]}" -gt 0 ]; do
  # Start new jobs if we have capacity and a GPU is available
  while [ "${#QUANT_PIDS[@]}" -lt "$MAX_PARALLEL_JOBS" ] && [ "$quant_idx" -lt "${#QUANT_JOBS[@]}" ]; do
    get_free_gpu
    if [ -z "$ALLOCATED_GPU" ]; then
      break # No GPUs available, wait
    fi
    gpu=$ALLOCATED_GPU

    job_info="${QUANT_JOBS[$quant_idx]}"
    IFS='|' read -r model_key model_short scheme technique branch save_dir <<< "$job_info"

    if [ "$installed_branch" != "$branch" ]; then
      # Wait for all running jobs to finish before switching
      if [ "${#QUANT_PIDS[@]}" -gt 0 ]; then
        release_gpu "$gpu"
        break
      fi
      if [ "$failed_branch" == "$branch" ] || ! switch_branch "$branch"; then
        # The checkout may be in an unknown state now; force a fresh switch
        # for whichever branch comes next.
        installed_branch=""
        failed_branch=$branch
        release_gpu "$gpu"
        echo ""
        echo "✗ QUANT FAILED: $model_short / $scheme / $technique / $branch (could not switch branch)"
        echo ""
        quant_failed=$((quant_failed + 1))
        quant_idx=$((quant_idx + 1))
        continue
      fi
      installed_branch=$branch
      failed_branch=""
    fi

    # Branch names may contain '/', which must not end up in a file name.
    timestamp=$(date +%Y%m%d-%H%M%S)
    log_file="$EVAL_LOGS_DIR/${timestamp}_QUANT_${model_short}_${scheme}_${technique}_${branch//\//_}.log"
    echo "Starting quant job $((quant_idx + 1))/${#QUANT_JOBS[@]} on GPU $gpu: $model_short / $scheme / $technique / $branch"
    echo " Log: $log_file"

    # Values are handed over as positional parameters, never spliced into the
    # script text, so quotes in paths or names cannot alter the command.
    CUDA_VISIBLE_DEVICES=$gpu bash -c '
      source /home/HDCharles/rhdev/bin/activate
      python "$1/testing/quantize.py" \
        --model "$2" \
        --technique "$3" \
        --scheme "$4" \
        --save-dir "$5" 2>&1
    ' _ "$REPO_DIR" "$model_key" "$technique" "$scheme" "$save_dir" &> "$log_file" &
    pid=$!

    QUANT_PIDS+=("$pid")
    QUANT_LOGS+=("$log_file")
    QUANT_INFO+=("$model_short|$scheme|$technique|$branch|$save_dir")
    QUANT_GPUS+=("$gpu")
    quant_idx=$((quant_idx + 1))
  done

  # Check for completed jobs
  new_pids=()
  new_logs=()
  new_info=()
  new_gpus=()
  for i in "${!QUANT_PIDS[@]}"; do
    pid="${QUANT_PIDS[$i]}"
    if kill -0 "$pid" 2>/dev/null; then
      # Still running, keep it
      new_pids+=("$pid")
      new_logs+=("${QUANT_LOGS[$i]}")
      new_info+=("${QUANT_INFO[$i]}")
      new_gpus+=("${QUANT_GPUS[$i]}")
    else
      # Job finished: collect its exit status and free its GPU
      wait "$pid" 2>/dev/null
      exit_code=$?
      log_file="${QUANT_LOGS[$i]}"
      gpu="${QUANT_GPUS[$i]}"
      IFS='|' read -r model_short scheme technique branch save_dir <<< "${QUANT_INFO[$i]}"
      release_gpu "$gpu"
      if [ "$exit_code" -eq 0 ]; then
        echo ""
        echo "✓ QUANT COMPLETED: $model_short / $scheme / $technique / $branch (GPU $gpu freed)"
        echo " Saved to: $save_dir"
        echo " Log: $log_file"
        echo ""
        quant_completed=$((quant_completed + 1))
      else
        echo ""
        echo "✗ QUANT FAILED: $model_short / $scheme / $technique / $branch (GPU $gpu freed)"
        echo " Log: $log_file"
        echo ""
        quant_failed=$((quant_failed + 1))
      fi
    fi
  done

  # Update arrays
  QUANT_PIDS=("${new_pids[@]}")
  QUANT_LOGS=("${new_logs[@]}")
  QUANT_INFO=("${new_info[@]}")
  QUANT_GPUS=("${new_gpus[@]}")
  sleep 5
done

echo ""
echo "Quantization phase complete:"
echo " Completed: $quant_completed"
echo " Failed: $quant_failed"
echo " Skipped: $TOTAL_SKIPPED (already existed)"
echo ""
# ── Step 2: Parallel Evaluation ─────────────────────────────────────────────
printf '%s\n' \
  "" \
  "╔══════════════════════════════════════════════════════════════════════════════╗" \
  "║ STEP 2: PARALLEL EVALUATION (${MAX_PARALLEL_JOBS} jobs at a time) ║" \
  "╚══════════════════════════════════════════════════════════════════════════════╝" \
  ""

# Start a fresh results CSV containing only the header row
printf '%s\n' "model,scheme,technique,branch,task,metric,status,backend,save_dir" > "$RESULTS_CSV"

# Queue of pending evaluations. Record layout:
# save_dir|model_short|scheme|technique|branch|task_name|lm_task|fewshot|eval_dir|max_len|tp_size
declare -a EVAL_JOBS

# Process models in order: smallest to largest (3B, 8B, 30B)
MODEL_ORDER=("Qwen/Qwen2.5-3B-Instruct" "meta-llama/Meta-Llama-3-8B-Instruct" "Qwen/Qwen3-30B-A3B")
for model_key in "${MODEL_ORDER[@]}"; do
  # Models that are not configured in MODELS are ignored
  [[ -n "${MODELS[$model_key]}" ]] || continue
  IFS=',' read -r model_short max_len tp_size <<< "${MODELS[$model_key]}"
  for scheme in "${SCHEMES[@]}"; do
    for technique in "${TECHNIQUES[@]}"; do
      for branch in "${BRANCHES[@]}"; do
        save_dir="$MODEL_BASE_DIR/${model_short}-${scheme}-${technique}-${branch}"
        # Nothing to evaluate when the quantized model is missing
        [[ -d "$save_dir" && -f "$save_dir/config.json" ]] || continue
        for eval_idx in "${!EVAL_TASKS[@]}"; do
          task_name="${EVAL_TASKS[$eval_idx]}"
          lm_task="${EVAL_LM_TASKS[$eval_idx]}"
          fewshot="${EVAL_FEWSHOT[$eval_idx]}"
          eval_dir="$EVAL_BASE_DIR/${model_short}-${scheme}-${technique}-${branch}/${task_name}"
          if find "$eval_dir" -name "results_*.json" -type f 2>/dev/null | grep -q .; then
            # Results from an earlier run exist: record them as CACHED
            metric_val=$(extract_metric "$eval_dir" "$lm_task")
            echo "$model_short,$scheme,$technique,$branch,$task_name,$metric_val,CACHED,cached,$save_dir" >> "$RESULTS_CSV"
            echo " CACHED: $model_short / $scheme / $technique / $branch / $task_name = $metric_val"
          else
            EVAL_JOBS+=("$save_dir|$model_short|$scheme|$technique|$branch|$task_name|$lm_task|$fewshot|$eval_dir|$max_len|$tp_size")
          fi
        done
      done
    done
  done
done

printf '%s\n' "" "Total evaluation jobs to run: ${#EVAL_JOBS[@]}" ""
# Run evaluations in parallel, one GPU per job.
#
# FP8 models are evaluated through the vLLM backend of lm_eval, everything
# else through the HF backend; apart from --model/--model_args the launch
# command is the same. Each finished job appends one row to RESULTS_CSV.
#
# Parallel arrays describing the running jobs (same index = same job):
declare -a RUNNING_PIDS
declare -a RUNNING_LOGS
declare -a RUNNING_INFO
declare -a RUNNING_GPUS
job_idx=0
completed_count=0
failed_count=0

while [ "$job_idx" -lt "${#EVAL_JOBS[@]}" ] || [ "${#RUNNING_PIDS[@]}" -gt 0 ]; do
  # Start new jobs if we have capacity and a GPU is available
  while [ "${#RUNNING_PIDS[@]}" -lt "$MAX_PARALLEL_JOBS" ] && [ "$job_idx" -lt "${#EVAL_JOBS[@]}" ]; do
    get_free_gpu
    if [ -z "$ALLOCATED_GPU" ]; then
      break # No GPUs available, wait
    fi
    gpu=$ALLOCATED_GPU

    job_info="${EVAL_JOBS[$job_idx]}"
    IFS='|' read -r save_dir model_short scheme technique branch task_name lm_task fewshot eval_dir max_len tp_size <<< "$job_info"

    # Branch names may contain '/', which must not end up in a file name.
    timestamp=$(date +%Y%m%d-%H%M%S)
    log_file="$EVAL_LOGS_DIR/${timestamp}_${model_short}_${scheme}_${technique}_${branch//\//_}_${task_name}.log"
    echo "Starting job $((job_idx + 1))/${#EVAL_JOBS[@]} on GPU $gpu: $model_short / $scheme / $technique / $branch / $task_name"
    echo " Log: $log_file"

    # Choose backend based on scheme
    if [ "$scheme" == "FP8" ]; then
      # vLLM for FP8 (single GPU; tp_size is not passed on)
      backend="vllm"
      model_args="pretrained=$save_dir,dtype=auto,max_model_len=$max_len,add_bos_token=True,gpu_memory_utilization=0.85"
    else
      # HF for NVFP4
      backend="hf"
      model_args="pretrained=$save_dir,dtype=auto,add_bos_token=True"
    fi

    # Values are handed over as positional parameters, never spliced into the
    # script text, so quotes in paths or names cannot alter the command.
    #   $1 eval_dir  $2 fewshot  $3 backend  $4 model_args  $5 lm_task
    CUDA_VISIBLE_DEVICES=$gpu bash -c '
      source /home/HDCharles/vllm/bin/activate
      mkdir -p "$1"
      chat_args=(--apply_chat_template)
      if [ "$2" -gt 0 ]; then
        chat_args+=(--fewshot_as_multiturn)
      fi
      lm_eval \
        --model "$3" \
        --model_args "$4" \
        --tasks "$5" \
        --num_fewshot "$2" \
        --batch_size auto \
        "${chat_args[@]}" \
        --output_path "$1" 2>&1
    ' _ "$eval_dir" "$fewshot" "$backend" "$model_args" "$lm_task" &> "$log_file" &
    pid=$!

    RUNNING_PIDS+=("$pid")
    RUNNING_LOGS+=("$log_file")
    RUNNING_INFO+=("$model_short|$scheme|$technique|$branch|$task_name|$lm_task|$save_dir|$eval_dir|$backend")
    RUNNING_GPUS+=("$gpu")
    job_idx=$((job_idx + 1))
  done

  # Check for completed jobs
  new_pids=()
  new_logs=()
  new_info=()
  new_gpus=()
  for i in "${!RUNNING_PIDS[@]}"; do
    pid="${RUNNING_PIDS[$i]}"
    if kill -0 "$pid" 2>/dev/null; then
      # Still running, keep it
      new_pids+=("$pid")
      new_logs+=("${RUNNING_LOGS[$i]}")
      new_info+=("${RUNNING_INFO[$i]}")
      new_gpus+=("${RUNNING_GPUS[$i]}")
    else
      # Job finished: collect its exit status and free its GPU
      wait "$pid" 2>/dev/null
      exit_code=$?
      log_file="${RUNNING_LOGS[$i]}"
      gpu="${RUNNING_GPUS[$i]}"
      IFS='|' read -r model_short scheme technique branch task_name lm_task save_dir eval_dir backend <<< "${RUNNING_INFO[$i]}"
      release_gpu "$gpu"
      if [ "$exit_code" -eq 0 ]; then
        metric_val=$(extract_metric "$eval_dir" "$lm_task")
        echo "$model_short,$scheme,$technique,$branch,$task_name,$metric_val,PASSED,$backend,$save_dir" >> "$RESULTS_CSV"
        echo ""
        echo "✓ COMPLETED: $model_short / $scheme / $technique / $branch / $task_name (GPU $gpu freed)"
        echo " Metric: $metric_val"
        echo " Log: $log_file"
        echo ""
        completed_count=$((completed_count + 1))
      else
        echo "$model_short,$scheme,$technique,$branch,$task_name,N/A,FAILED,$backend,$save_dir" >> "$RESULTS_CSV"
        echo ""
        echo "✗ FAILED: $model_short / $scheme / $technique / $branch / $task_name (GPU $gpu freed)"
        echo " Log: $log_file"
        echo ""
        failed_count=$((failed_count + 1))
      fi
    fi
  done

  # Update arrays
  RUNNING_PIDS=("${new_pids[@]}")
  RUNNING_LOGS=("${new_logs[@]}")
  RUNNING_INFO=("${new_info[@]}")
  RUNNING_GPUS=("${new_gpus[@]}")
  sleep 5
done
# ── Final Summary ────────────────────────────────────────────────────────────
# Report the job counters of this run, dump the results CSV as an aligned
# table, then print the branch comparison.
printf '%s\n' \
  "" \
  "╔══════════════════════════════════════════════════════════════════════════════╗" \
  "║ FINAL SUMMARY ║" \
  "╚══════════════════════════════════════════════════════════════════════════════╝" \
  "" \
  "Completed: $completed_count" \
  "Failed: $failed_count" \
  "Total: $((completed_count + failed_count))" \
  "" \
  "Results CSV: $RESULTS_CSV" \
  "Eval logs: $EVAL_LOGS_DIR/" \
  ""

if [[ -f "$RESULTS_CSV" ]]; then
  echo "Results (sorted by model, technique, branch, task):"
  # Header row stays on top; only the data rows are sorted.
  {
    head -1 "$RESULTS_CSV" \
      && tail -n +2 "$RESULTS_CSV" | sort -t',' -k1,1 -k3,3 -k4,4 -k5,5
  } | column -t -s','
fi

print_comparison
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Print current progress of parallel regression test:
# counts of PASSED / CACHED / FAILED rows in the results CSV, followed by the
# CSV rendered as an aligned table.
RESULTS_CSV="parallel_regression_results.csv"
if [ ! -f "$RESULTS_CSV" ]; then
  echo "No results file found: $RESULTS_CSV"
  exit 1
fi

# Count results by status.
# `grep -c` already prints 0 when nothing matches but exits with status 1;
# appending another "0" on failure would produce a two-line value that breaks
# the arithmetic below, so the status is only neutralised.
total=$(($(tail -n +2 "$RESULTS_CSV" | wc -l)))
passed=$(grep -c ",PASSED," "$RESULTS_CSV" || true)
cached=$(grep -c ",CACHED," "$RESULTS_CSV" || true)
failed=$(grep -c ",FAILED," "$RESULTS_CSV" || true)
completed=$((passed + cached))

echo ""
echo "╔══════════════════════════════════════════════════════════════════════════════╗"
echo "║ PROGRESS SUMMARY ║"
echo "╚══════════════════════════════════════════════════════════════════════════════╝"
echo ""
echo "Completed: $completed (passed: $passed, cached: $cached)"
echo "Failed: $failed"
echo "Total: $total"
echo ""

# Print results table (header row first, data rows sorted)
if [ "$total" -gt 0 ]; then
  echo "Results (sorted by model, technique, branch, task):"
  echo "────────────────────────────────────────────────────────────────────────────────"
  (head -1 "$RESULTS_CSV" && tail -n +2 "$RESULTS_CSV" | sort -t',' -k1,1 -k3,3 -k4,4 -k5,5) | column -t -s','
  echo ""
fi
# Print comparison table (main vs PR branch) from the results CSV.
# Only PASSED/CACHED rows are used, and only (model, scheme, technique, task)
# combinations that have a "main" result plus at least one other branch.
# The PR branch is taken from those comparable combinations, so a combination
# that only has a "main" result cannot hide the PR column.
python3 - "$RESULTS_CSV" <<'PYEOF'
import csv, sys

csv_path = sys.argv[1]
rows = []
with open(csv_path) as f:
    reader = csv.DictReader(f)
    for r in reader:
        if r.get('status') in ['PASSED', 'CACHED']:
            rows.append(r)
if not rows:
    sys.exit()

# Build lookup: (model, scheme, technique, task) -> {branch: metric}
lookup = {}
for r in rows:
    key = (r["model"], r["scheme"], r["technique"], r["task"])
    lookup.setdefault(key, {})
    lookup[key][r["branch"]] = r["metric"]

entries = sorted(
    ((k, v) for k, v in lookup.items()
     if "main" in v and any(b != "main" for b in v)),
    key=lambda kv: kv[0],
)
if not entries:
    sys.exit()

# Every entry has a non-main branch, so this always finds one.
pr_branch = next(b for _, branches in entries for b in branches if b != "main")


def parse_metric(s):
    s = s.strip()
    if s.endswith("%"):
        try:
            return float(s[:-1]), True
        except ValueError:
            return None, False
    try:
        return float(s), False
    except ValueError:
        return None, False


def calc_change(main_str, pr_str, task):
    m_val, _ = parse_metric(main_str)
    p_val, _ = parse_metric(pr_str)
    if m_val is None or p_val is None or m_val == 0:
        return "N/A"
    # For wikitext the sign is flipped: a lower PR value gives a positive change.
    if "wikitext" in task:
        pct = (m_val - p_val) / m_val * 100
    else:
        pct = (p_val - m_val) / m_val * 100
    sign = "+" if pct >= 0 else ""
    return f"{sign}{pct:.2f}%"


print("")
print("╔════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╗")
print(f"║ BRANCH COMPARISON (main vs {pr_branch})")
print("╚════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╝")
print("")
header = (f"{'model':<30} {'scheme':<10} {'technique':<12} {'task':<18} "
          f"{'main':>14} {'PR':>14} {'change':>12}")
print(header)
print("-" * len(header))
for (model, scheme, technique, task), metrics in entries:
    m = metrics.get("main", "")
    p = metrics.get(pr_branch, "")
    change = calc_change(m, p, task) if m and p else ""
    print(f"{model:<30} {scheme:<10} {technique:<12} {task:<18} "
          f"{m:>14} {p:>14} {change:>12}")
print("")
PYEOF
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import argparse | |
| import os | |
| import time | |
| import torch | |
| from compressed_tensors.offload import dispatch_model | |
| from compressed_tensors.quantization import preset_name_to_scheme | |
| from datasets import load_dataset | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
# Monkey-patch os.chmod to ignore permission errors on shared cache
_original_chmod = os.chmod


def _chmod_ignore_errors(path, mode, **kwargs):
    """Call the real ``os.chmod`` but swallow ``PermissionError``.

    Keyword arguments (``dir_fd``, ``follow_symlinks``) are forwarded so that
    callers using the full ``os.chmod`` signature keep working after the patch.
    Any other exception still propagates.
    """
    try:
        _original_chmod(path, mode, **kwargs)
    except PermissionError:
        pass  # Silently ignore chmod errors on shared cache files


os.chmod = _chmod_ignore_errors
| from llmcompressor import oneshot | |
| from llmcompressor.modifiers.gptq import GPTQModifier | |
| from llmcompressor.modifiers.quantization import QuantizationModifier | |
| from llmcompressor.modifiers.transform.awq import AWQModifier | |
| from llmcompressor.modifiers.transform.imatrix import IMatrixGatherer | |
| from llmcompressor.modifiers.transform.smoothquant import SmoothQuantModifier | |
# Per-model quantization settings, keyed by model id:
#   ignore - module names / "re:" patterns passed as ``ignore`` to the modifiers
#   is_moe - flag consulted by build_recipe
MODEL_CONFIGS = {
    "Qwen/Qwen2.5-3B-Instruct": dict(ignore=["lm_head"], is_moe=False),
    "meta-llama/Meta-Llama-3-8B-Instruct": dict(ignore=["lm_head"], is_moe=False),
    "Qwen/Qwen3-30B-A3B": dict(
        ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"],
        is_moe=True,
    ),
}

# Dataset id and split name.
DATASET_ID, DATASET_SPLIT = "HuggingFaceH4/ultrachat_200k", "train_sft"
def build_recipe(technique, scheme, ignore, is_moe):
    """Return the list of modifiers that implements one quantization technique.

    Args:
        technique: One of "awq_rtn", "rtn", "rtn_mse", "gptq", "imatrix".
        scheme: Preset scheme name; passed to the modifiers or resolved with
            preset_name_to_scheme.
        ignore: Value passed as the ``ignore`` argument of the modifiers.
        is_moe: When truthy, AWQ duo scaling is turned off for "awq_rtn".

    Returns:
        A list of modifier instances, in application order.

    Raises:
        ValueError: If ``technique`` is not one of the names above.
    """

    def plain_quantizer():
        # Quantizer driven directly by the preset scheme name.
        return QuantizationModifier(
            ignore=ignore, scheme=scheme, targets=["Linear"]
        )

    def preset_with_observer(observer_name):
        # Resolve the preset for Linear targets and override its weight observer.
        preset = preset_name_to_scheme(scheme, ["Linear"])
        preset.weights.observer = observer_name
        return preset

    if technique == "rtn":
        return [plain_quantizer()]

    if technique == "awq_rtn":
        # duo_scaling only works with per-channel strategies (GROUP, CHANNEL).
        # FP8 uses the TENSOR strategy, so duo_scaling is disabled for it
        # (and likewise whenever is_moe is set).
        duo_setting = False if ("FP8" in scheme or is_moe) else "both"
        return [AWQModifier(duo_scaling=duo_setting), plain_quantizer()]

    if technique == "rtn_mse":
        mse_preset = preset_with_observer("memoryless_mse")
        return [
            QuantizationModifier(
                config_groups={"group_0": mse_preset}, ignore=ignore
            )
        ]

    if technique == "gptq":
        # W8A8 schemes get a SmoothQuant pass ahead of GPTQ.
        steps = (
            [SmoothQuantModifier(smoothing_strength=0.8)]
            if "W8A8" in scheme
            else []
        )
        steps.append(
            GPTQModifier(ignore=ignore, scheme=scheme, targets=["Linear"])
        )
        return steps

    if technique == "imatrix":
        imatrix_preset = preset_with_observer("imatrix_mse")
        return [
            IMatrixGatherer(ignore=ignore),
            QuantizationModifier(
                config_groups={"group_0": imatrix_preset}, ignore=ignore
            ),
        ]

    raise ValueError(f"Unknown technique: {technique}")
def main():
    """Command-line entry point: quantize one model and save the result.

    Parses --model, --technique, --scheme, --save-dir, --num-samples and
    --max-seq-length, loads the model and tokenizer, prepares a tokenized
    calibration set from DATASET_ID / DATASET_SPLIT, runs ``oneshot`` with the
    recipe from build_recipe, prints timing and peak GPU memory, and writes the
    compressed model and tokenizer to --save-dir.

    Raises:
        ValueError: If --model is not a key of MODEL_CONFIGS.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--model", required=True)
    cli.add_argument(
        "--technique",
        required=True,
        choices=["awq_rtn", "rtn", "rtn_mse", "gptq", "imatrix"],
    )
    cli.add_argument("--scheme", required=True)
    cli.add_argument("--save-dir", required=True)
    cli.add_argument("--num-samples", type=int, default=256)
    cli.add_argument("--max-seq-length", type=int, default=512)
    args = cli.parse_args()

    if args.model not in MODEL_CONFIGS:
        raise ValueError(
            f"Unknown model: {args.model}. "
            f"Known models: {list(MODEL_CONFIGS.keys())}"
        )
    model_cfg = MODEL_CONFIGS[args.model]

    model = AutoModelForCausalLM.from_pretrained(args.model, dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)

    # Take the first num_samples rows of the split, then shuffle them.
    calib_split = f"{DATASET_SPLIT}[:{args.num_samples}]"
    ds = load_dataset(DATASET_ID, split=calib_split).shuffle(seed=42)

    def render_chat(example):
        # Turn the "messages" field into one chat-templated string.
        rendered = tokenizer.apply_chat_template(
            example["messages"], tokenize=False
        )
        return {"text": rendered}

    def encode(sample):
        # The chat template was applied above, so no special tokens are added.
        return tokenizer(
            sample["text"],
            padding=False,
            max_length=args.max_seq_length,
            truncation=True,
            add_special_tokens=False,
        )

    ds = ds.map(render_chat)
    ds = ds.map(encode, remove_columns=ds.column_names)

    recipe = build_recipe(
        args.technique, args.scheme, model_cfg["ignore"], model_cfg["is_moe"]
    )

    # Reset the peak counter so the reported figure covers the oneshot call.
    torch.cuda.reset_peak_memory_stats()
    started = time.time()
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=args.max_seq_length,
        num_calibration_samples=args.num_samples,
    )
    seconds = time.time() - started
    peak_gib = torch.cuda.max_memory_allocated() / (1024**3)

    print("Quantization Complete")
    print(f"Technique: {args.technique}, Scheme: {args.scheme}")
    print(f"Time: {seconds / 60:.2f} minutes ({seconds:.2f} seconds)")
    print(f"Peak GPU Memory: {peak_gib:.2f} GB")

    model.save_pretrained(args.save_dir, save_compressed=True)
    tokenizer.save_pretrained(args.save_dir)
    print(f"Model saved to {args.save_dir}")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment