HDCharles · May 5, 2026 17:17
diff --git a/parallel_regression_test.sh b/parallel_regression_test.sh
 #!/bin/bash
 # Parallel Regression Test Script
 # 1. Quantizes remaining models (if needed)
 # 2. Runs evaluations in parallel (up to MAX_PARALLEL_JOBS at a time)
 # 3. Saves individual logs for each eval job
 # 4. Prints summaries as jobs complete

 # Make a pipeline fail when any of its stages fails, so the "$?" checks that
 # follow "cmd | tail" pipelines see cmd's failure rather than tail's success.
 set -o pipefail

 # ── Configuration ────────────────────────────────────────────────────────────

 # Repository root: the parent of the directory that contains this script.
 REPO_DIR="$(cd "$(dirname "$0")/.." && pwd)"
 # Exported so child processes (quantization / evaluation) inherit it.
 export HF_DATASETS_CACHE="$HOME/hf_hub"
 mkdir -p "$HF_DATASETS_CACHE"

 # Quantized checkpoints are written below MODEL_BASE_DIR; evaluation results and
 # per-job logs are written relative to the current working directory.
 MODEL_BASE_DIR="$HOME/hf_hub/regression_models"
 EVAL_BASE_DIR="./eval_results"
 EVAL_LOGS_DIR="./eval_logs"
 RESULTS_CSV="parallel_regression_results.csv"

 # Models to test, keyed by model ID.
 # Value format: "short_name,max_model_len,tp_size" (split on ',' by the job builders).
 declare -A MODELS=(
    ["Qwen/Qwen2.5-3B-Instruct"]="Qwen2.5-3B-Instruct,2048,1"
    ["meta-llama/Meta-Llama-3-8B-Instruct"]="Meta-Llama-3-8B-Instruct,2048,1"
    ["Qwen/Qwen3-30B-A3B"]="Qwen3-30B-A3B,2048,2"
 )

 # Test matrix: every model is combined with every technique, branch and scheme.
 # "main" is the baseline branch that the comparison table measures against.
 TECHNIQUES=("awq_rtn" "rtn" "rtn_mse" "gptq" "imatrix")
 BRANCHES=("main" "90_refactor_obs")
 SCHEMES=("NVFP4" "FP8")

 # Parallel arrays, read by shared index: display name, lm_eval task name and
 # few-shot count of each evaluation. Keep all three the same length.
 EVAL_TASKS=("wikitext" "mmlu")
 EVAL_LM_TASKS=("wikitext" "mmlu")
 EVAL_FEWSHOT=("0" "5")

 # Parallel config (both quantization and evaluation)
 MAX_PARALLEL_JOBS=4

 mkdir -p "$EVAL_BASE_DIR" "$MODEL_BASE_DIR" "$EVAL_LOGS_DIR"

 # List the GPUs that `chg status` reports as IN_USE by the current user.
 # Outputs: one GPU id per line on stdout (a single empty line when none match
 #          or when `chg` is unavailable).
 detect_reserved_gpus() {
    # Account name that the reservation table is matched against.
    local me
    me=$(whoami)

    # Drop ANSI colour codes, split the table on its '│' column separator, skip
    # the two leading lines, and keep the trimmed first column of every row
    # whose second column mentions IN_USE and whose third column matches the user.
    local gpu_ids
    gpu_ids=$(
        chg status 2>/dev/null \
            | sed 's/\x1b\[[0-9;]*m//g' \
            | awk -F '│' -v user="$me" '
                NR > 2 && $2 ~ /IN_USE/ && $3 ~ user {
                    gsub(/^[ \t]+|[ \t]+$/, "", $1)
                    print $1
                }'
    )

    echo "$gpu_ids"
 }

 # Discover the GPU pool once at start-up; nothing can run without it.
 echo "Detecting reserved GPUs via chg status..."
 AVAILABLE_GPUS=($(detect_reserved_gpus))

 if (( ${#AVAILABLE_GPUS[@]} == 0 )); then
    echo "ERROR: No GPUs reserved. Please reserve GPUs using 'chg reserve <gpu_ids>' first."
    echo "Example: chg reserve 0,1,2,3"
    exit 1
 fi

 echo "Reserved GPUs detected: ${AVAILABLE_GPUS[*]}"
 echo "Will use up to $MAX_PARALLEL_JOBS parallel jobs"
 echo ""

 # GPU allocation tracking: GPU id -> 1 while a job of ours holds it, else 0.
 declare -A GPU_IN_USE
 for gpu in "${AVAILABLE_GPUS[@]}"; do
    GPU_IN_USE[$gpu]=0
 done

 # Helper: get next available GPU from reserved pool
 # Sets ALLOCATED_GPU to the GPU ID, or empty string if none available
 # Globals:  AVAILABLE_GPUS (read), GPU_IN_USE (read/written), ALLOCATED_GPU (written)
 # Outputs:  [DEBUG] tracking lines on stdout
 # Returns:  0 when a GPU was claimed, 1 when every GPU is already claimed
 # Note:     call it directly, not via $(...): the result is handed back through
 #           globals, which a command-substitution subshell would discard.
 get_free_gpu() {
    # Keep the loop variable local so the caller's own "gpu" is not overwritten.
    local gpu
    local internal_status=""

    ALLOCATED_GPU=""

    # Show internal tracking status
    for gpu in "${AVAILABLE_GPUS[@]}"; do
        internal_status+="GPU$gpu:${GPU_IN_USE[$gpu]} "
    done
    echo "[DEBUG] Reserved GPU tracking: $internal_status"

    # Find a GPU that's not currently allocated by us
    for gpu in "${AVAILABLE_GPUS[@]}"; do
        if [ "${GPU_IN_USE[$gpu]}" -eq 0 ]; then
            # GPU is available, claim it
            GPU_IN_USE[$gpu]=1
            ALLOCATED_GPU=$gpu
            echo "[DEBUG] Allocated GPU $gpu"
            return 0
        fi
    done

    # No GPU available
    echo "[DEBUG] No free GPUs available (all currently allocated by script)"
    return 1
 }

 # Helper: hand a GPU back to the pool
 # Arguments: $1 - GPU id previously claimed through get_free_gpu
 release_gpu() {
    # Clearing the flag is all it takes; get_free_gpu treats 0 as "free".
    GPU_IN_USE["$1"]=0
 }

 # ── Helper: activate environments ────────────────────────────────────────────

 # Load the virtualenv used for quantization into the current shell.
 activate_quant_env() {
    . /home/HDCharles/rhdev/bin/activate
 }

 # Load the virtualenv used for evaluation into the current shell.
 activate_eval_env() {
    . /home/HDCharles/vllm/bin/activate
 }

 # ── Helper: checkout branch and reinstall ────────────────────────────────────

 # Check out a branch of the repository and reinstall it in editable mode.
 # Globals:   REPO_DIR (read)
 # Arguments: $1 - branch name to check out
 # Outputs:   progress and error messages on stdout (tool output is trimmed by tail)
 # Returns:   0 on success, 1 when the checkout or the pip install fails
 switch_branch() {
    local branch=$1
    local -a stage_status

    echo "  Switching to branch: $branch"
    git -C "$REPO_DIR" checkout "$branch" 2>&1 | tail -5
    # Read git's own status from PIPESTATUS: "$?" would report tail's status
    # unless the caller happens to have pipefail enabled.
    stage_status=("${PIPESTATUS[@]}")
    if [ "${stage_status[0]}" -ne 0 ]; then
        echo "  ERROR: git checkout $branch failed"
        return 1
    fi

    activate_quant_env
    pip install -e "$REPO_DIR" 2>&1 | tail -1
    stage_status=("${PIPESTATUS[@]}")
    if [ "${stage_status[0]}" -ne 0 ]; then
        # Without this check a broken install would be reported as a success and
        # later jobs would run against whatever package was installed before.
        echo "  ERROR: pip install of $REPO_DIR failed on branch $branch"
        return 1
    fi
    echo "  Installed llm-compressor from branch $branch"
 }

 # ── Helper: quantize a model ─────────────────────────────────────────────────

 # Quantize one model in the foreground unless its output already exists.
 # Globals:   REPO_DIR (read)
 # Arguments: $1 model id, $2 short model name, $3 technique, $4 scheme,
 #            $5 branch label (used in messages only), $6 output directory
 # Outputs:   banner, progress and the quantizer's combined output on stdout
 # Returns:   0 when the model exists or was produced, 1 when quantization fails
 quantize_model() {
    local model=$1 model_short=$2 technique=$3 scheme=$4 branch=$5 save_dir=$6

    echo ""
    echo "╔══════════════════════════════════════════════════════════════════════════════╗"
    echo "║  QUANTIZING: $model_short / $technique / $scheme / $branch"
    echo "╚══════════════════════════════════════════════════════════════════════════════╝"
    echo ""

    # A directory holding config.json is taken as a finished earlier run.
    if [[ -d "$save_dir" && -f "$save_dir/config.json" ]]; then
        echo "Quantized model already exists at $save_dir, skipping."
        return 0
    fi

    activate_quant_env

    if ! python "$REPO_DIR/testing/quantize.py" \
            --model "$model" \
            --technique "$technique" \
            --scheme "$scheme" \
            --save-dir "$save_dir" 2>&1; then
        echo "QUANTIZATION FAILED for $model_short / $scheme / $technique / $branch"
        return 1
    fi

    echo "Model saved to $save_dir"
    return 0
 }

 # ── Helper: run single evaluation (called in background) ────────────────────

 # Run one lm_eval evaluation and capture everything it prints in a log file.
 # Arguments: $1 short model name, $2 scheme, $3 technique, $4 branch,
 #            $5 task display name, $6 lm_eval task, $7 few-shot count,
 #            $8 quantized model directory, $9 evaluation output directory,
 #            $10 log file (receives stdout and stderr)
 # Returns:   0 when the evaluation passed, 1 otherwise
 # Note:      only the NVFP4 scheme has a backend here (HF); any other scheme
 #            runs nothing and is reported as FAILED.
 run_single_eval() {
    local model_short=$1
    local scheme=$2
    local technique=$3
    local branch=$4
    local task_name=$5
    local lm_task=$6
    local fewshot=$7
    local save_dir=$8
    local eval_dir=$9
    local log_file="${10}"

    # Run in a subshell: "exit" then ends only this evaluation (and the
    # environment activation stays contained) instead of terminating the
    # calling script when the function is invoked in the foreground.
    (
        echo "════════════════════════════════════════════════════════════════"
        echo "EVAL START: $model_short / $technique / $branch / $task_name"
        echo "Task: $lm_task, Fewshot: $fewshot"
        echo "════════════════════════════════════════════════════════════════"
        echo ""

        mkdir -p "$eval_dir"
        activate_eval_env

        result="FAILED"
        backend="FAILED"

        # Try HF backend for NVFP4
        if [ "$scheme" == "NVFP4" ]; then
            echo "Using HF backend for NVFP4..."
            # An array keeps each flag a separate word without relying on
            # unquoted word splitting.
            chat_args=(--apply_chat_template)
            if [ "$fewshot" -gt 0 ]; then
                chat_args+=(--fewshot_as_multiturn)
            fi

            if lm_eval \
                --model hf \
                --model_args "pretrained=$save_dir,dtype=auto,add_bos_token=True" \
                --tasks "$lm_task" \
                --num_fewshot "$fewshot" \
                --batch_size auto \
                "${chat_args[@]}" \
                --output_path "$eval_dir" 2>&1; then
                result="PASSED"
                backend="hf"
            fi
        fi

        echo ""
        echo "════════════════════════════════════════════════════════════════"
        echo "EVAL COMPLETE: $result"
        echo "Backend: $backend"
        echo "════════════════════════════════════════════════════════════════"

        # Report the outcome through the subshell's exit status
        if [ "$result" == "PASSED" ]; then
            exit 0
        else
            exit 1
        fi
    ) &> "$log_file"

    return $?
 }

 # ── Helper: extract metric from eval results ────────────────────────────────

 # Print the headline metric of one task from an lm_eval output directory.
 # Arguments: $1 - directory searched recursively for results_*.json
 #            $2 - task name; the first results key containing it is used
 # Outputs:   the formatted metric on stdout (wikitext: word perplexity,
 #            mmlu: accuracy in percent, gsm8k: strict exact match in percent,
 #            otherwise the first numeric non-stderr value), or "N/A" when no
 #            results file, task or metric is found or the Python helper fails
 extract_metric() {
    local eval_output_dir=$1
    local task=$2

    # Use the results file whose path sorts last when several exist.
    local results_json
    results_json=$(find "$eval_output_dir" -name "results_*.json" -type f 2>/dev/null | sort | tail -1)

    if [ -z "$results_json" ]; then
        echo "N/A"
        return
    fi

    # The path and the task name are handed over as arguments (sys.argv) rather
    # than pasted into the Python source, so quotes or backslashes in either of
    # them cannot break or alter the program.
    python3 -c "
 import json, sys
 with open(sys.argv[1]) as f:
    data = json.load(f)
 results = data.get('results', {})
 task = sys.argv[2]

 task_results = None
 for key in results:
    if task in key:
        task_results = results[key]
        break

 if task_results is None:
    print('N/A')
    sys.exit()

 if 'gsm8k' in task:
    val = task_results.get('exact_match,strict-match')
    if val is not None:
        print(f'{val*100:.2f}%')
    else:
        print('N/A')
 elif 'wikitext' in task:
    val = task_results.get('word_perplexity,none')
    if val is not None:
        print(f'{val:.2f}')
    else:
        print('N/A')
 elif 'mmlu' in task:
    val = task_results.get('acc,none')
    if val is not None:
        print(f'{val*100:.2f}%')
    else:
        print('N/A')
 else:
    for k, v in task_results.items():
        if 'stderr' not in k and k != 'alias' and isinstance(v, (int, float)):
            print(f'{v:.4f}')
            sys.exit()
    print('N/A')
 " "$results_json" "$task" 2>/dev/null || echo "N/A"
 }

 # ── Helper: print comparison summary ─────────────────────────────────────────

 # Print a table comparing the "main" branch with the PR branch for every
 # (model, scheme, technique, task) that has a usable result on both.
 # Globals: RESULTS_CSV (read)
 # Outputs: the comparison table on stdout; nothing when the CSV is missing or
 #          no combination has results for both branches
 print_comparison() {
    if [ ! -f "$RESULTS_CSV" ]; then
        return
    fi

    python3 - "$RESULTS_CSV" <<'PYEOF'
 import csv, sys

 csv_path = sys.argv[1]

 # Only rows with a usable metric take part in the comparison.
 rows = []
 with open(csv_path) as f:
    reader = csv.DictReader(f)
    for r in reader:
        if r.get('status') in ['PASSED', 'CACHED']:
            rows.append(r)

 if not rows:
    sys.exit()

 # Build lookup: (model, scheme, technique, task) -> {branch: metric}
 lookup = {}
 for r in rows:
    key = (r["model"], r["scheme"], r["technique"], r["task"])
    lookup.setdefault(key, {})
    lookup[key][r["branch"]] = r["metric"]

 # Keep the combinations that have main plus at least one other branch.
 entries = [(k, v) for k, v in lookup.items()
           if "main" in v and any(b != "main" for b in v)]
 if not entries:
    sys.exit()

 # Take the PR branch name from a comparable entry. Reading it from the first
 # lookup row instead fails whenever that row only has a "main" result: the
 # name then falls back to a placeholder and the PR column stays empty.
 pr_branch = next(b for _, branches in entries for b in branches if b != "main")

 def parse_metric(s):
    # Returns (value, is_percent); value is None when s is not a number.
    s = s.strip()
    if s.endswith("%"):
        try:
            return float(s[:-1]), True
        except ValueError:
            return None, False
    try:
        return float(s), False
    except ValueError:
        return None, False

 def calc_change(main_str, pr_str, task):
    # Relative change in percent, signed so that "+" always means the PR
    # branch is better: wikitext reports perplexity, where lower is better.
    m_val, _ = parse_metric(main_str)
    p_val, _ = parse_metric(pr_str)
    if m_val is None or p_val is None or m_val == 0:
        return "N/A"
    if "wikitext" in task:
        pct = (m_val - p_val) / m_val * 100
    else:
        pct = (p_val - m_val) / m_val * 100
    sign = "+" if pct >= 0 else ""
    return f"{sign}{pct:.2f}%"

 print("")
 print("╔════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╗")
 print(f"║  BRANCH COMPARISON (main vs {pr_branch})")
 print("╚════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╝")
 print("")

 header = (f"{'model':<30} {'scheme':<10} {'technique':<12} {'task':<18} "
          f"{'main':>14} {'PR':>14} {'change':>12}")
 print(header)
 print("-" * len(header))

 for (model, scheme, technique, task), metrics in sorted(entries):
    m = metrics.get("main", "")
    p = metrics.get(pr_branch, "")
    change = calc_change(m, p, task) if m and p else ""
    print(f"{model:<30} {scheme:<10} {technique:<12} {task:<18} "
          f"{m:>14} {p:>14} {change:>12}")

 print("")
 PYEOF
 }

 # ── Helper: run quantization in background ──────────────────────────────────

 # Quantize one model and capture everything it prints in a log file.
 # Globals:   REPO_DIR (read)
 # Arguments: $1 model id, $2 short model name, $3 technique, $4 scheme,
 #            $5 branch label (used in the log banner only), $6 output directory,
 #            $7 log file (receives stdout and stderr)
 # Returns:   0 when quantization succeeded, 1 otherwise
 run_quantize_job() {
    local model=$1
    local model_short=$2
    local technique=$3
    local scheme=$4
    local branch=$5
    local save_dir=$6
    local log_file=$7

    # Run in a subshell: "exit" then ends only this job (and the environment
    # activation stays contained) instead of terminating the calling script
    # when the function is invoked in the foreground.
    (
        echo "════════════════════════════════════════════════════════════════"
        echo "QUANT START: $model_short / $technique / $branch"
        echo "════════════════════════════════════════════════════════════════"
        echo ""

        activate_quant_env

        if python "$REPO_DIR/testing/quantize.py" \
            --model "$model" \
            --technique "$technique" \
            --scheme "$scheme" \
            --save-dir "$save_dir" 2>&1; then
            echo ""
            echo "════════════════════════════════════════════════════════════════"
            echo "QUANT COMPLETE: SUCCESS"
            echo "Model saved to $save_dir"
            echo "════════════════════════════════════════════════════════════════"
            exit 0
        else
            echo ""
            echo "════════════════════════════════════════════════════════════════"
            echo "QUANT COMPLETE: FAILED"
            echo "════════════════════════════════════════════════════════════════"
            exit 1
        fi
    ) &> "$log_file"

    return $?
 }

 # ── Step 1: Parallel Quantization ───────────────────────────────────────────
 # This part only plans the work: it builds the queue of missing checkpoints,
 # orders it by branch and initialises the bookkeeping that the scheduler loop
 # below consumes.

 echo ""
 echo "╔══════════════════════════════════════════════════════════════════════════════╗"
 echo "║  STEP 1: PARALLEL QUANTIZATION (${MAX_PARALLEL_JOBS} jobs at a time)                                 ║"
 echo "╚══════════════════════════════════════════════════════════════════════════════╝"
 echo ""

 # Build list of quantization jobs needed
 # Each entry is "model_key|model_short|scheme|technique|branch|save_dir".
 declare -a QUANT_JOBS
 TOTAL_SKIPPED=0

 # Note: the key order of an associative array is unspecified, so models are
 # visited in no particular order here.
 for model_key in "${!MODELS[@]}"; do
    IFS=',' read -r model_short max_len tp_size <<< "${MODELS[$model_key]}"

    for scheme in "${SCHEMES[@]}"; do
        for technique in "${TECHNIQUES[@]}"; do
            for branch in "${BRANCHES[@]}"; do
                save_dir="$MODEL_BASE_DIR/${model_short}-${scheme}-${technique}-${branch}"

                # A directory holding config.json counts as already quantized.
                if [ -d "$save_dir" ] && [ -f "$save_dir/config.json" ]; then
                    TOTAL_SKIPPED=$((TOTAL_SKIPPED + 1))
                    echo "  SKIP: $model_short / $scheme / $technique / $branch (already exists)"
                    continue
                fi

                # Add to quantization queue
                QUANT_JOBS+=("$model_key|$model_short|$scheme|$technique|$branch|$save_dir")
            done
        done
    done
 done

 # Sort jobs by branch to batch them efficiently
 # (the scheduler drains all running jobs before it switches branch, so
 # grouping by branch keeps the number of switches low).
 if [ ${#QUANT_JOBS[@]} -gt 0 ]; then
    # Separate into main and other branches
    declare -a MAIN_JOBS
    declare -a OTHER_JOBS

    for job_info in "${QUANT_JOBS[@]}"; do
        # Only the fifth field (branch) is needed here.
        IFS='|' read -r _ _ _ _ branch _ <<< "$job_info"
        if [ "$branch" == "main" ]; then
            MAIN_JOBS+=("$job_info")
        else
            OTHER_JOBS+=("$job_info")
        fi
    done

    # Rebuild QUANT_JOBS with main first, then others
    QUANT_JOBS=("${MAIN_JOBS[@]}" "${OTHER_JOBS[@]}")
 fi

 echo ""
 echo "Already quantized: $TOTAL_SKIPPED models"
 echo "Quantization jobs to run: ${#QUANT_JOBS[@]}"
 echo ""

 if [ ${#QUANT_JOBS[@]} -gt 0 ]; then
    echo "Models to be quantized:"
    echo "────────────────────────────────────────────────────────────────"
    for job_info in "${QUANT_JOBS[@]}"; do
        IFS='|' read -r _ model_short scheme technique branch _ <<< "$job_info"
        echo "  • $model_short / $scheme / $technique / $branch"
    done
    echo "────────────────────────────────────────────────────────────────"
    echo ""
 fi

 # Run quantization jobs in parallel
 # Parallel arrays describing the running jobs: entry i of each array belongs to
 # the same background process.
 declare -a QUANT_PIDS
 declare -a QUANT_LOGS
 declare -a QUANT_INFO
 declare -a QUANT_GPUS

 quant_idx=0        # index of the next queue entry to start
 quant_completed=0
 quant_failed=0

 # Switch to first branch for initial setup
 if [ ${#QUANT_JOBS[@]} -gt 0 ]; then
    IFS='|' read -r _ _ _ _ first_branch _ <<< "${QUANT_JOBS[0]}"
    switch_branch "$first_branch"
 fi

 # Scheduler loop: runs until the queue is empty and no job is still running.
 # Each pass (1) starts as many queued jobs as slots and free GPUs allow and
 # (2) reaps the jobs that have finished, then sleeps before polling again.
 while [ $quant_idx -lt ${#QUANT_JOBS[@]} ] || [ ${#QUANT_PIDS[@]} -gt 0 ]; do
    # Start new jobs if we have capacity and a GPU is available
    while [ ${#QUANT_PIDS[@]} -lt $MAX_PARALLEL_JOBS ] && [ $quant_idx -lt ${#QUANT_JOBS[@]} ]; do
        # Try to get a free GPU
        get_free_gpu
        if [ -z "$ALLOCATED_GPU" ]; then
            break  # No GPUs available, wait
        fi
        gpu=$ALLOCATED_GPU

        # Parse job info
        job_info="${QUANT_JOBS[$quant_idx]}"
        IFS='|' read -r model_key model_short scheme technique branch save_dir <<< "$job_info"

        # Check if we need to switch branches
        current_branch=$(git -C "$REPO_DIR" rev-parse --abbrev-ref HEAD 2>/dev/null)
        if [ "$current_branch" != "$branch" ]; then
            # Wait for all running jobs to finish before switching
            if [ ${#QUANT_PIDS[@]} -gt 0 ]; then
                release_gpu "$gpu"
                break
            fi
            if ! switch_branch "$branch"; then
                # Never quantize from the wrong checkout: the result would be
                # saved under this branch's name and silently distort the
                # branch comparison. Count the job as failed and move on.
                echo ""
                echo "✗ QUANT FAILED: $model_short / $scheme / $technique / $branch (could not switch to branch $branch)"
                echo ""
                release_gpu "$gpu"
                quant_failed=$((quant_failed + 1))
                quant_idx=$((quant_idx + 1))
                continue
            fi
        fi

        # Create log file
        timestamp=$(date +%Y%m%d-%H%M%S)
        log_file="$EVAL_LOGS_DIR/${timestamp}_QUANT_${model_short}_${scheme}_${technique}_${branch}.log"

        echo "Starting quant job $((quant_idx + 1))/${#QUANT_JOBS[@]} on GPU $gpu: $model_short / $scheme / $technique / $branch"
        echo "  Log: $log_file"

        # Start background job with specific GPU
        CUDA_VISIBLE_DEVICES=$gpu bash -c "
            source /home/HDCharles/rhdev/bin/activate
            python '$REPO_DIR/testing/quantize.py' \
                --model '$model_key' \
                --technique '$technique' \
                --scheme '$scheme' \
                --save-dir '$save_dir' 2>&1
        " &> "$log_file" &

        # Record the job in the parallel bookkeeping arrays
        pid=$!
        QUANT_PIDS+=($pid)
        QUANT_LOGS+=("$log_file")
        QUANT_INFO+=("$model_short|$scheme|$technique|$branch|$save_dir")
        QUANT_GPUS+=($gpu)

        quant_idx=$((quant_idx + 1))
    done

    # Check for completed jobs
    new_pids=()
    new_logs=()
    new_info=()
    new_gpus=()

    for i in "${!QUANT_PIDS[@]}"; do
        pid="${QUANT_PIDS[$i]}"

        if kill -0 "$pid" 2>/dev/null; then
            # Still running, keep it
            new_pids+=("$pid")
            new_logs+=("${QUANT_LOGS[$i]}")
            new_info+=("${QUANT_INFO[$i]}")
            new_gpus+=("${QUANT_GPUS[$i]}")
        else
            # Job finished; wait collects the exit status of the background job
            wait "$pid" 2>/dev/null
            exit_code=$?

            log_file="${QUANT_LOGS[$i]}"
            gpu="${QUANT_GPUS[$i]}"
            IFS='|' read -r model_short scheme technique branch save_dir <<< "${QUANT_INFO[$i]}"

            # Release GPU
            release_gpu "$gpu"

            if [ $exit_code -eq 0 ]; then
                echo ""
                echo "✓ QUANT COMPLETED: $model_short / $scheme / $technique / $branch (GPU $gpu freed)"
                echo "  Saved to: $save_dir"
                echo "  Log: $log_file"
                echo ""
                quant_completed=$((quant_completed + 1))
            else
                echo ""
                echo "✗ QUANT FAILED: $model_short / $scheme / $technique / $branch (GPU $gpu freed)"
                echo "  Log: $log_file"
                echo ""
                quant_failed=$((quant_failed + 1))
            fi
        fi
    done

    # Update arrays
    QUANT_PIDS=("${new_pids[@]}")
    QUANT_LOGS=("${new_logs[@]}")
    QUANT_INFO=("${new_info[@]}")
    QUANT_GPUS=("${new_gpus[@]}")

    sleep 5
 done

 echo ""
 echo "Quantization phase complete:"
 echo "  Completed: $quant_completed"
 echo "  Failed: $quant_failed"
 echo "  Skipped: $TOTAL_SKIPPED (already existed)"
 echo ""

 # ── Step 2: Parallel Evaluation ─────────────────────────────────────────────
 # This part only plans the work: it resets the results CSV, records cached
 # results and queues an evaluation for every quantized model and task that has
 # no result yet. The scheduler loop below runs the queue.

 echo ""
 echo "╔══════════════════════════════════════════════════════════════════════════════╗"
 echo "║  STEP 2: PARALLEL EVALUATION (${MAX_PARALLEL_JOBS} jobs at a time)                                   ║"
 echo "╚══════════════════════════════════════════════════════════════════════════════╝"
 echo ""

 # Initialize results CSV
 # (overwrites the CSV of any previous run; cached results are re-added below)
 echo "model,scheme,technique,branch,task,metric,status,backend,save_dir" > "$RESULTS_CSV"

 # Build list of all eval jobs
 # Each entry is "save_dir|model_short|scheme|technique|branch|task_name|lm_task|
 # fewshot|eval_dir|max_len|tp_size".
 declare -a EVAL_JOBS

 # Process models in order: smallest to largest (3B, 8B, 30B)
 MODEL_ORDER=("Qwen/Qwen2.5-3B-Instruct" "meta-llama/Meta-Llama-3-8B-Instruct" "Qwen/Qwen3-30B-A3B")

 for model_key in "${MODEL_ORDER[@]}"; do
    # Skip if model not in MODELS array
    if [ -z "${MODELS[$model_key]}" ]; then
        continue
    fi

    IFS=',' read -r model_short max_len tp_size <<< "${MODELS[$model_key]}"

    for scheme in "${SCHEMES[@]}"; do
        for technique in "${TECHNIQUES[@]}"; do
            for branch in "${BRANCHES[@]}"; do
                save_dir="$MODEL_BASE_DIR/${model_short}-${scheme}-${technique}-${branch}"

                # Skip if model doesn't exist
                # (for example because its quantization failed in step 1)
                if [ ! -d "$save_dir" ] || [ ! -f "$save_dir/config.json" ]; then
                    continue
                fi

                # EVAL_TASKS, EVAL_LM_TASKS and EVAL_FEWSHOT are read by shared index.
                for eval_idx in "${!EVAL_TASKS[@]}"; do
                    task_name="${EVAL_TASKS[$eval_idx]}"
                    lm_task="${EVAL_LM_TASKS[$eval_idx]}"
                    fewshot="${EVAL_FEWSHOT[$eval_idx]}"
                    eval_dir="$EVAL_BASE_DIR/${model_short}-${scheme}-${technique}-${branch}/${task_name}"

                    # Skip if results already exist
                    # (the stored metric is still written to the CSV, as CACHED)
                    if find "$eval_dir" -name "results_*.json" -type f 2>/dev/null | grep -q .; then
                        metric_val=$(extract_metric "$eval_dir" "$lm_task")
                        echo "$model_short,$scheme,$technique,$branch,$task_name,$metric_val,CACHED,cached,$save_dir" >> "$RESULTS_CSV"
                        echo "  CACHED: $model_short / $scheme / $technique / $branch / $task_name = $metric_val"
                        continue
                    fi

                    # Add to job queue (include max_len and tp_size for vLLM)
                    EVAL_JOBS+=("$save_dir|$model_short|$scheme|$technique|$branch|$task_name|$lm_task|$fewshot|$eval_dir|$max_len|$tp_size")
                done
            done
        done
    done
 done

 echo ""
 echo "Total evaluation jobs to run: ${#EVAL_JOBS[@]}"
 echo ""

 # Run evaluations in parallel
 # Parallel arrays describing the running jobs: entry i of each array belongs to
 # the same background process.
 declare -a RUNNING_PIDS
 declare -a RUNNING_LOGS
 declare -a RUNNING_INFO
 declare -a RUNNING_GPUS

 job_idx=0          # index of the next queue entry to start
 completed_count=0
 failed_count=0

 # Scheduler loop: runs until the queue is empty and no job is still running.
 # Each pass (1) starts as many queued evaluations as slots and free GPUs allow
 # and (2) reaps the jobs that have finished, appending one CSV row per job,
 # then sleeps before polling again.
 while [ $job_idx -lt ${#EVAL_JOBS[@]} ] || [ ${#RUNNING_PIDS[@]} -gt 0 ]; do
    # Start new jobs if we have capacity and a GPU is available
    while [ ${#RUNNING_PIDS[@]} -lt $MAX_PARALLEL_JOBS ] && [ $job_idx -lt ${#EVAL_JOBS[@]} ]; do
        # Try to get a free GPU
        get_free_gpu
        if [ -z "$ALLOCATED_GPU" ]; then
            break  # No GPUs available, wait
        fi
        gpu=$ALLOCATED_GPU

        # Parse job info
        job_info="${EVAL_JOBS[$job_idx]}"
        IFS='|' read -r save_dir model_short scheme technique branch task_name lm_task fewshot eval_dir max_len tp_size <<< "$job_info"

        # Create log file
        timestamp=$(date +%Y%m%d-%H%M%S)
        log_file="$EVAL_LOGS_DIR/${timestamp}_${model_short}_${scheme}_${technique}_${branch}_${task_name}.log"

        echo "Starting job $((job_idx + 1))/${#EVAL_JOBS[@]} on GPU $gpu: $model_short / $scheme / $technique / $branch / $task_name"
        echo "  Log: $log_file"

        # Choose backend based on scheme
        # Each job runs in its own bash process pinned to one GPU through
        # CUDA_VISIBLE_DEVICES; the job values are substituted into the command
        # text before that process starts. tp_size is parsed above but not
        # passed to either backend.
        if [ "$scheme" == "FP8" ]; then
            # Use vLLM for FP8 (always TP=1 for single-GPU async handling)
            backend="vllm"
            CUDA_VISIBLE_DEVICES=$gpu bash -c "
                source /home/HDCharles/vllm/bin/activate
                mkdir -p '$eval_dir'

                chat_args='--apply_chat_template'
                if [ '$fewshot' -gt 0 ]; then
                    chat_args=\"\$chat_args --fewshot_as_multiturn\"
                fi

                lm_eval \
                    --model vllm \
                    --model_args 'pretrained=$save_dir,dtype=auto,max_model_len=$max_len,add_bos_token=True,gpu_memory_utilization=0.85' \
                    --tasks '$lm_task' \
                    --num_fewshot '$fewshot' \
                    --batch_size auto \
                    \$chat_args \
                    --output_path '$eval_dir' 2>&1
            " &> "$log_file" &
        else
            # Use HF for NVFP4
            backend="hf"
            CUDA_VISIBLE_DEVICES=$gpu bash -c "
                source /home/HDCharles/vllm/bin/activate
                mkdir -p '$eval_dir'

                chat_args='--apply_chat_template'
                if [ '$fewshot' -gt 0 ]; then
                    chat_args=\"\$chat_args --fewshot_as_multiturn\"
                fi

                lm_eval \
                    --model hf \
                    --model_args 'pretrained=$save_dir,dtype=auto,add_bos_token=True' \
                    --tasks '$lm_task' \
                    --num_fewshot '$fewshot' \
                    --batch_size auto \
                    \$chat_args \
                    --output_path '$eval_dir' 2>&1
            " &> "$log_file" &
        fi

        # Record the job in the parallel bookkeeping arrays
        pid=$!
        RUNNING_PIDS+=($pid)
        RUNNING_LOGS+=("$log_file")
        RUNNING_INFO+=("$model_short|$scheme|$technique|$branch|$task_name|$lm_task|$save_dir|$eval_dir|$backend")
        RUNNING_GPUS+=($gpu)

        job_idx=$((job_idx + 1))
    done

    # Check for completed jobs
    new_pids=()
    new_logs=()
    new_info=()
    new_gpus=()

    for i in "${!RUNNING_PIDS[@]}"; do
        pid="${RUNNING_PIDS[$i]}"

        if kill -0 "$pid" 2>/dev/null; then
            # Still running, keep it
            new_pids+=("$pid")
            new_logs+=("${RUNNING_LOGS[$i]}")
            new_info+=("${RUNNING_INFO[$i]}")
            new_gpus+=("${RUNNING_GPUS[$i]}")
        else
            # Job finished; wait collects the exit status of the background job
            wait "$pid" 2>/dev/null
            exit_code=$?

            log_file="${RUNNING_LOGS[$i]}"
            gpu="${RUNNING_GPUS[$i]}"
            IFS='|' read -r model_short scheme technique branch task_name lm_task save_dir eval_dir backend <<< "${RUNNING_INFO[$i]}"

            # Release GPU
            release_gpu "$gpu"

            if [ $exit_code -eq 0 ]; then
                # Read the metric back from the results file the job wrote
                metric_val=$(extract_metric "$eval_dir" "$lm_task")
                echo "$model_short,$scheme,$technique,$branch,$task_name,$metric_val,PASSED,$backend,$save_dir" >> "$RESULTS_CSV"

                echo ""
                echo "✓ COMPLETED: $model_short / $scheme / $technique / $branch / $task_name (GPU $gpu freed)"
                echo "  Metric: $metric_val"
                echo "  Log: $log_file"
                echo ""

                completed_count=$((completed_count + 1))
            else
                echo "$model_short,$scheme,$technique,$branch,$task_name,N/A,FAILED,$backend,$save_dir" >> "$RESULTS_CSV"

                echo ""
                echo "✗ FAILED: $model_short / $scheme / $technique / $branch / $task_name (GPU $gpu freed)"
                echo "  Log: $log_file"
                echo ""

                failed_count=$((failed_count + 1))
            fi
        fi
    done

    # Update arrays
    RUNNING_PIDS=("${new_pids[@]}")
    RUNNING_LOGS=("${new_logs[@]}")
    RUNNING_INFO=("${new_info[@]}")
    RUNNING_GPUS=("${new_gpus[@]}")

    sleep 5
 done

 # ── Final Summary ────────────────────────────────────────────────────────────
 # Totals of the evaluation phase, the full results table and, last, the
 # branch comparison.

 echo ""
 echo "╔══════════════════════════════════════════════════════════════════════════════╗"
 echo "║  FINAL SUMMARY                                                               ║"
 echo "╚══════════════════════════════════════════════════════════════════════════════╝"
 echo ""
 echo "Completed: $completed_count"
 echo "Failed: $failed_count"
 echo "Total: $((completed_count + failed_count))"
 echo ""
 echo "Results CSV: $RESULTS_CSV"
 echo "Eval logs: $EVAL_LOGS_DIR/"
 echo ""

 if [[ -f "$RESULTS_CSV" ]]; then
    echo "Results (sorted by model, technique, branch, task):"
    # Header line first, then the data rows sorted on columns 1, 3, 4 and 5.
    {
        head -1 "$RESULTS_CSV" \
            && tail -n +2 "$RESULTS_CSV" | sort -t',' -k1,1 -k3,3 -k4,4 -k5,5
    } | column -t -s','
 fi

 print_comparison
diff --git a/print_progress.sh b/print_progress.sh
 #!/bin/bash
 # Print current progress of parallel regression test
 # Reads the results CSV from the current working directory and prints the
 # per-status counts followed by the sorted results table.

 RESULTS_CSV="parallel_regression_results.csv"

 if [ ! -f "$RESULTS_CSV" ]; then
    echo "No results file found: $RESULTS_CSV"
    exit 1
 fi

 # Count results by status
 # grep -c always prints a count: with no match it prints 0 and exits 1. An
 # "|| echo 0" fallback would therefore append a second 0 and break the
 # arithmetic below, so the count is used as printed.
 total=$(tail -n +2 "$RESULTS_CSV" | wc -l)
 passed=$(grep -c ",PASSED," "$RESULTS_CSV")
 cached=$(grep -c ",CACHED," "$RESULTS_CSV")
 failed=$(grep -c ",FAILED," "$RESULTS_CSV")
 # Only empty when grep itself could not run or read the file.
 passed=${passed:-0}
 cached=${cached:-0}
 failed=${failed:-0}
 completed=$((passed + cached))

 echo ""
 echo "╔══════════════════════════════════════════════════════════════════════════════╗"
 echo "║  PROGRESS SUMMARY                                                            ║"
 echo "╚══════════════════════════════════════════════════════════════════════════════╝"
 echo ""
 echo "Completed: $completed (passed: $passed, cached: $cached)"
 echo "Failed: $failed"
 echo "Total: $total"
 echo ""

 # Print results table
 if (( total > 0 )); then
    echo "Results (sorted by model, technique, branch, task):"
    echo "────────────────────────────────────────────────────────────────────────────────"
    (head -1 "$RESULTS_CSV" && tail -n +2 "$RESULTS_CSV" | sort -t',' -k1,1 -k3,3 -k4,4 -k5,5) | column -t -s','
    echo ""
 fi

 # Print comparison table
 # Compares the "main" branch with the PR branch for every (model, scheme,
 # technique, task) that has a usable result on both; prints nothing otherwise.
 python3 - "$RESULTS_CSV" <<'PYEOF'
 import csv, sys

 csv_path = sys.argv[1]

 # Only rows with a usable metric take part in the comparison.
 rows = []
 with open(csv_path) as f:
    reader = csv.DictReader(f)
    for r in reader:
        if r.get('status') in ['PASSED', 'CACHED']:
            rows.append(r)

 if not rows:
    sys.exit()

 # Build lookup: (model, scheme, technique, task) -> {branch: metric}
 lookup = {}
 for r in rows:
    key = (r["model"], r["scheme"], r["technique"], r["task"])
    lookup.setdefault(key, {})
    lookup[key][r["branch"]] = r["metric"]

 # Keep the combinations that have main plus at least one other branch.
 entries = [(k, v) for k, v in lookup.items()
           if "main" in v and any(b != "main" for b in v)]
 if not entries:
    sys.exit()

 # Take the PR branch name from a comparable entry. Reading it from the first
 # lookup row instead fails whenever that row only has a "main" result so far
 # (likely while a run is in progress): the name then falls back to a
 # placeholder and the PR column stays empty.
 pr_branch = next(b for _, branches in entries for b in branches if b != "main")

 def parse_metric(s):
    # Returns (value, is_percent); value is None when s is not a number.
    s = s.strip()
    if s.endswith("%"):
        try:
            return float(s[:-1]), True
        except ValueError:
            return None, False
    try:
        return float(s), False
    except ValueError:
        return None, False

 def calc_change(main_str, pr_str, task):
    # Relative change in percent, signed so that "+" always means the PR
    # branch is better: wikitext reports perplexity, where lower is better.
    m_val, _ = parse_metric(main_str)
    p_val, _ = parse_metric(pr_str)
    if m_val is None or p_val is None or m_val == 0:
        return "N/A"
    if "wikitext" in task:
        pct = (m_val - p_val) / m_val * 100
    else:
        pct = (p_val - m_val) / m_val * 100
    sign = "+" if pct >= 0 else ""
    return f"{sign}{pct:.2f}%"

 print("")
 print("╔════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╗")
 print(f"║  BRANCH COMPARISON (main vs {pr_branch})")
 print("╚════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╝")
 print("")

 header = (f"{'model':<30} {'scheme':<10} {'technique':<12} {'task':<18} "
          f"{'main':>14} {'PR':>14} {'change':>12}")
 print(header)
 print("-" * len(header))

 for (model, scheme, technique, task), metrics in sorted(entries):
    m = metrics.get("main", "")
    p = metrics.get(pr_branch, "")
    change = calc_change(m, p, task) if m and p else ""
    print(f"{model:<30} {scheme:<10} {technique:<12} {task:<18} "
          f"{m:>14} {p:>14} {change:>12}")

 print("")
 PYEOF
diff --git a/quantize.py b/quantize.py
 import argparse
 import os
 import time

 import torch
 from compressed_tensors.offload import dispatch_model
 from compressed_tensors.quantization import preset_name_to_scheme
 from datasets import load_dataset
 from transformers import AutoModelForCausalLM, AutoTokenizer

 # Monkey-patch os.chmod to ignore permission errors on shared cache
 # The patch is installed here, before the llmcompressor imports that follow,
 # so it is already in place when those modules are loaded.
 _original_chmod = os.chmod
 def _chmod_ignore_errors(path, mode):
    # Delegate to the real os.chmod; only PermissionError is swallowed, every
    # other exception still propagates to the caller.
    try:
        _original_chmod(path, mode)
    except PermissionError:
        pass  # Silently ignore chmod errors on shared cache files
 os.chmod = _chmod_ignore_errors

 from llmcompressor import oneshot
 from llmcompressor.modifiers.gptq import GPTQModifier
 from llmcompressor.modifiers.quantization import QuantizationModifier
 from llmcompressor.modifiers.transform.awq import AWQModifier
 from llmcompressor.modifiers.transform.imatrix import IMatrixGatherer
 from llmcompressor.modifiers.transform.smoothquant import SmoothQuantModifier

 # Per-model settings, keyed by model ID.
 #   "ignore": module name patterns; presumably the `ignore` list handed to
 #             build_recipe (the "re:" prefix looks like a regex marker) —
 #             TODO confirm against the caller.
 #   "is_moe": whether the model is a mixture-of-experts model.
 MODEL_CONFIGS = {
    "Qwen/Qwen2.5-3B-Instruct": {
        "ignore": ["lm_head"],
        "is_moe": False,
    },
    "meta-llama/Meta-Llama-3-8B-Instruct": {
        "ignore": ["lm_head"],
        "is_moe": False,
    },
    "Qwen/Qwen3-30B-A3B": {
        "ignore": ["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"],
        "is_moe": True,
    },
 }

 # NOTE(review): presumably the calibration dataset and split; the code that
 # uses these two constants is outside the visible part of the file.
 DATASET_ID = "HuggingFaceH4/ultrachat_200k"
 DATASET_SPLIT = "train_sft"


 def build_recipe(technique, scheme, ignore, is_moe):
    """Assemble the ordered list of modifiers for one quantization technique.

    Args:
        technique: One of "awq_rtn", "rtn", "rtn_mse", "gptq", "imatrix".
        scheme: Preset scheme name. It is passed to the modifiers directly, or
            resolved with ``preset_name_to_scheme`` for "rtn_mse"/"imatrix".
        ignore: Module names/patterns forwarded as ``ignore`` to the
            quantization, GPTQ and imatrix modifiers.
        is_moe: When true, AWQ ``duo_scaling`` is switched off.

    Returns:
        A list of modifier instances, in the order they appear in the recipe.

    Raises:
        ValueError: If ``technique`` is not one of the names listed above.
    """

    def preset_with_observer(observer_name):
        # Resolve the preset for Linear layers and override its weight observer.
        resolved = preset_name_to_scheme(scheme, ["Linear"])
        resolved.weights.observer = observer_name
        return resolved

    if technique == "awq_rtn":
        # duo_scaling only works with per-channel strategies (GROUP, CHANNEL);
        # FP8 uses the TENSOR strategy, so it is disabled there (and for MoE).
        duo = False if ("FP8" in scheme or is_moe) else "both"
        awq = AWQModifier(duo_scaling=duo)
        quant = QuantizationModifier(
            ignore=ignore, scheme=scheme, targets=["Linear"]
        )
        return [awq, quant]

    if technique == "rtn":
        quant = QuantizationModifier(
            ignore=ignore, scheme=scheme, targets=["Linear"]
        )
        return [quant]

    if technique == "rtn_mse":
        mse_scheme = preset_with_observer("memoryless_mse")
        quant = QuantizationModifier(
            config_groups={"group_0": mse_scheme},
            ignore=ignore,
        )
        return [quant]

    if technique == "gptq":
        # SmoothQuant is prepended only for W8A8 schemes.
        steps = (
            [SmoothQuantModifier(smoothing_strength=0.8)]
            if "W8A8" in scheme
            else []
        )
        steps.append(
            GPTQModifier(ignore=ignore, scheme=scheme, targets=["Linear"])
        )
        return steps

    if technique == "imatrix":
        imatrix_scheme = preset_with_observer("imatrix_mse")
        gatherer = IMatrixGatherer(ignore=ignore)
        quant = QuantizationModifier(
            config_groups={"group_0": imatrix_scheme},
            ignore=ignore,
        )
        return [gatherer, quant]

    raise ValueError(f"Unknown technique: {technique}")


 def main():
    """Command-line entry point: quantize one model and save the result.

    Parses the CLI flags, looks up the model in MODEL_CONFIGS, builds a
    calibration set from the first ``--num-samples`` rows of the configured
    dataset split, runs ``oneshot`` with the recipe from ``build_recipe``,
    prints elapsed time and peak GPU memory, and writes the model and
    tokenizer to ``--save-dir``.

    Raises:
        ValueError: If ``--model`` is not a key of MODEL_CONFIGS.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--model", required=True)
    cli.add_argument(
        "--technique",
        required=True,
        choices=["awq_rtn", "rtn", "rtn_mse", "gptq", "imatrix"],
    )
    cli.add_argument("--scheme", required=True)
    cli.add_argument("--save-dir", required=True)
    cli.add_argument("--num-samples", type=int, default=256)
    cli.add_argument("--max-seq-length", type=int, default=512)
    args = cli.parse_args()

    model_cfg = MODEL_CONFIGS.get(args.model)
    if model_cfg is None:
        known = list(MODEL_CONFIGS.keys())
        raise ValueError(f"Unknown model: {args.model}. Known models: {known}")

    model = AutoModelForCausalLM.from_pretrained(args.model, dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)

    def to_chat_text(example):
        # Render the conversation with the chat template, as a string.
        rendered = tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
        return {"text": rendered}

    def to_token_ids(sample):
        # Truncate to --max-seq-length; no padding, no extra special tokens.
        return tokenizer(
            sample["text"],
            padding=False,
            max_length=args.max_seq_length,
            truncation=True,
            add_special_tokens=False,
        )

    # Slice first, then shuffle with a fixed seed.
    calib = load_dataset(
        DATASET_ID, split=f"{DATASET_SPLIT}[:{args.num_samples}]"
    )
    calib = calib.shuffle(seed=42)
    calib = calib.map(to_chat_text)
    calib = calib.map(to_token_ids, remove_columns=calib.column_names)

    recipe = build_recipe(
        args.technique, args.scheme, model_cfg["ignore"], model_cfg["is_moe"]
    )

    # Reset the peak-memory counter and start the timer before oneshot runs.
    torch.cuda.reset_peak_memory_stats()
    started = time.time()

    oneshot(
        model=model,
        dataset=calib,
        recipe=recipe,
        max_seq_length=args.max_seq_length,
        num_calibration_samples=args.num_samples,
    )

    elapsed_time = time.time() - started
    peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3)
    print("Quantization Complete")
    print(f"Technique: {args.technique}, Scheme: {args.scheme}")
    print(f"Time: {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")
    print(f"Peak GPU Memory: {peak_memory_gb:.2f} GB")

    model.save_pretrained(args.save_dir, save_compressed=True)
    tokenizer.save_pretrained(args.save_dir)
    print(f"Model saved to {args.save_dir}")


 # Run the CLI only when this file is executed as a script, not when imported.
 if __name__ == "__main__":
    main()
	#!/bin/bash
	# Print current progress of parallel regression test.
	#
	# Reads parallel_regression_results.csv from the current directory, prints
	# per-status counts, then the result rows sorted and aligned into columns.
	# Exits 1 when the results file does not exist.

	RESULTS_CSV="parallel_regression_results.csv"

	if [[ ! -f "$RESULTS_CSV" ]]; then
		echo "No results file found: $RESULTS_CSV"
		exit 1
	fi

	# Count results by status.
	# total skips the CSV header line. For the status counts, `grep -c` already
	# prints 0 when nothing matches but exits 1, so the fallback is `|| true`;
	# `|| echo 0` would produce "0<newline>0" and break the arithmetic below.
	total=$(tail -n +2 "$RESULTS_CSV" | wc -l)
	passed=$(grep -c ",PASSED," "$RESULTS_CSV" || true)
	cached=$(grep -c ",CACHED," "$RESULTS_CSV" || true)
	failed=$(grep -c ",FAILED," "$RESULTS_CSV" || true)
	completed=$((passed + cached))

	echo ""
	echo "╔══════════════════════════════════════════════════════════════════════════════╗"
	echo "║ PROGRESS SUMMARY ║"
	echo "╚══════════════════════════════════════════════════════════════════════════════╝"
	echo ""
	echo "Completed: $completed (passed: $passed, cached: $cached)"
	echo "Failed: $failed"
	echo "Total: $total"
	echo ""

	# Print results table: header row first, then the data rows sorted on CSV
	# fields 1, 3, 4 and 5, all aligned by `column`.
	if ((total > 0)); then
		echo "Results (sorted by model, technique, branch, task):"
		echo "────────────────────────────────────────────────────────────────────────────────"
		(head -1 "$RESULTS_CSV" && tail -n +2 "$RESULTS_CSV" | sort -t',' -k1,1 -k3,3 -k4,4 -k5,5) | column -t -s','
		echo ""
	fi

	# Print comparison table
	# `<<-` (rather than `<<`) strips the leading tabs from every heredoc line,
	# including the closing PYEOF, so this tab-indented block terminates properly
	# and Python sees only the space-based indentation below.
	python3 - "$RESULTS_CSV" <<-'PYEOF'
	import csv, sys

	csv_path = sys.argv[1]

	# Keep only rows whose status is PASSED or CACHED.
	rows = []
	with open(csv_path) as f:
	    reader = csv.DictReader(f)
	    for r in reader:
	        if r.get('status') in ['PASSED', 'CACHED']:
	            rows.append(r)

	if not rows:
	    sys.exit()

	# Build lookup: (model, scheme, technique, task) -> {branch: metric}
	lookup = {}
	for r in rows:
	    key = (r["model"], r["scheme"], r["technique"], r["task"])
	    lookup.setdefault(key, {})
	    lookup[key][r["branch"]] = r["metric"]

	# Only keys that have a "main" metric and at least one other branch.
	entries = [(k, v) for k, v in lookup.items()
	           if "main" in v and any(b != "main" for b in v)]
	if not entries:
	    sys.exit()

	# Take the PR branch name from the comparable entries rather than from the
	# first lookup key: mid-run, that first key may only have a "main" result,
	# which would leave the PR column empty for every row.
	pr_branch = next(
	    (b for _, v in entries for b in v if b != "main"), "pr"
	)

	def parse_metric(s):
	    """Return (value, is_percent); value is None when s is not numeric."""
	    s = s.strip()
	    if s.endswith("%"):
	        try:
	            return float(s[:-1]), True
	        except ValueError:
	            return None, False
	    try:
	        return float(s), False
	    except ValueError:
	        return None, False

	def calc_change(main_str, pr_str, task):
	    """Relative change of PR vs main as a signed percent string, or "N/A"."""
	    m_val, _ = parse_metric(main_str)
	    p_val, _ = parse_metric(pr_str)
	    if m_val is None or p_val is None or m_val == 0:
	        return "N/A"
	    if "wikitext" in task:
	        # Sign is flipped for wikitext tasks: a lower PR value is positive.
	        pct = (m_val - p_val) / m_val * 100
	    else:
	        pct = (p_val - m_val) / m_val * 100
	    sign = "+" if pct >= 0 else ""
	    return f"{sign}{pct:.2f}%"

	print("")
	print("╔════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╗")
	print(f"║ BRANCH COMPARISON (main vs {pr_branch})")
	print("╚════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╝")
	print("")

	header = (f"{'model':<30} {'scheme':<10} {'technique':<12} {'task':<18} "
	          f"{'main':>14} {'PR':>14} {'change':>12}")
	print(header)
	print("-" * len(header))

	for (model, scheme, technique, task), metrics in sorted(entries):
	    m = metrics.get("main", "")
	    p = metrics.get(pr_branch, "")
	    change = calc_change(m, p, task) if m and p else ""
	    print(f"{model:<30} {scheme:<10} {technique:<12} {task:<18} "
	          f"{m:>14} {p:>14} {change:>12}")

	print("")
	PYEOF
	import argparse
	import os
	import time

	import torch
	from compressed_tensors.offload import dispatch_model
	from compressed_tensors.quantization import preset_name_to_scheme
	from datasets import load_dataset
	from transformers import AutoModelForCausalLM, AutoTokenizer

	# Monkey-patch os.chmod to ignore permission errors on shared cache
	_original_chmod = os.chmod
	def _chmod_ignore_errors(path, mode, *args, **kwargs):
	    """Drop-in replacement for ``os.chmod`` that swallows ``PermissionError``.

	    Extra positional/keyword arguments (``dir_fd``, ``follow_symlinks``) are
	    forwarded unchanged so callers that use them keep working after the patch.
	    """
	    try:
	        _original_chmod(path, mode, *args, **kwargs)
	    except PermissionError:
	        pass  # Deliberately best-effort: ignore chmod errors on shared cache files
	os.chmod = _chmod_ignore_errors

	from llmcompressor import oneshot
	from llmcompressor.modifiers.gptq import GPTQModifier
	from llmcompressor.modifiers.quantization import QuantizationModifier
	from llmcompressor.modifiers.transform.awq import AWQModifier
	from llmcompressor.modifiers.transform.imatrix import IMatrixGatherer
	from llmcompressor.modifiers.transform.smoothquant import SmoothQuantModifier

	# Per-model settings, keyed by the model id given on the command line:
	#   "ignore" - module names / "re:" patterns handed to the modifiers' ignore list
	#   "is_moe" - flag read by build_recipe when choosing the AWQ duo_scaling value
	MODEL_CONFIGS = {
	    "Qwen/Qwen2.5-3B-Instruct": {
	        "ignore": ["lm_head"],
	        "is_moe": False,
	    },
	    "meta-llama/Meta-Llama-3-8B-Instruct": {
	        "ignore": ["lm_head"],
	        "is_moe": False,
	    },
	    "Qwen/Qwen3-30B-A3B": {
	        # The leading ".*" is required so the patterns match full dotted
	        # module paths ending in mlp.gate / mlp.shared_expert_gate.
	        "ignore": ["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"],
	        "is_moe": True,
	    },
	}

	# Calibration data source: dataset id and the split that main() slices from.
	DATASET_ID = "HuggingFaceH4/ultrachat_200k"
	DATASET_SPLIT = "train_sft"


	def build_recipe(technique, scheme, ignore, is_moe):
	    """Assemble the ordered list of modifiers for one quantization technique.

	    Args:
	        technique: One of "awq_rtn", "rtn", "rtn_mse", "gptq", "imatrix".
	        scheme: Preset scheme name. It is passed to the modifiers directly,
	            or resolved with ``preset_name_to_scheme`` for
	            "rtn_mse"/"imatrix".
	        ignore: Module names/patterns forwarded as ``ignore`` to the
	            quantization, GPTQ and imatrix modifiers.
	        is_moe: When true, AWQ ``duo_scaling`` is switched off.

	    Returns:
	        A list of modifier instances, in recipe order.

	    Raises:
	        ValueError: If ``technique`` is not one of the names listed above.
	    """
	    if technique == "awq_rtn":
	        # duo_scaling only works with per-channel strategies (GROUP, CHANNEL)
	        # FP8 uses TENSOR strategy, so disable duo_scaling for it
	        if "FP8" in scheme or is_moe:
	            duo = False
	        else:
	            duo = "both"
	        return [
	            AWQModifier(duo_scaling=duo),
	            QuantizationModifier(
	                ignore=ignore, scheme=scheme, targets=["Linear"]
	            ),
	        ]
	    elif technique == "rtn":
	        return [
	            QuantizationModifier(
	                ignore=ignore, scheme=scheme, targets=["Linear"]
	            ),
	        ]
	    elif technique == "rtn_mse":
	        # Resolve the preset, then override its weight observer.
	        scheme_obj = preset_name_to_scheme(scheme, ["Linear"])
	        scheme_obj.weights.observer = "memoryless_mse"
	        return [
	            QuantizationModifier(
	                config_groups={"group_0": scheme_obj},
	                ignore=ignore,
	            ),
	        ]
	    elif technique == "gptq":
	        recipe = []
	        # SmoothQuant is prepended only for W8A8 schemes.
	        if "W8A8" in scheme:
	            recipe.append(SmoothQuantModifier(smoothing_strength=0.8))
	        recipe.append(
	            GPTQModifier(ignore=ignore, scheme=scheme, targets=["Linear"])
	        )
	        return recipe
	    elif technique == "imatrix":
	        scheme_obj = preset_name_to_scheme(scheme, ["Linear"])
	        scheme_obj.weights.observer = "imatrix_mse"
	        return [
	            IMatrixGatherer(ignore=ignore),
	            QuantizationModifier(
	                config_groups={"group_0": scheme_obj},
	                ignore=ignore,
	            ),
	        ]
	    else:
	        raise ValueError(f"Unknown technique: {technique}")


	def main():
	    """Command-line entry point: quantize one model and save the result.

	    Parses the CLI flags, looks up the model in MODEL_CONFIGS, builds a
	    calibration set from the first ``--num-samples`` rows of the configured
	    dataset split, runs ``oneshot`` with the recipe from ``build_recipe``,
	    prints elapsed time and peak GPU memory, and writes the model and
	    tokenizer to ``--save-dir``.

	    Raises:
	        ValueError: If ``--model`` is not a key of MODEL_CONFIGS.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--model", required=True)
	    parser.add_argument(
	        "--technique", required=True,
	        choices=["awq_rtn", "rtn", "rtn_mse", "gptq", "imatrix"],
	    )
	    parser.add_argument("--scheme", required=True)
	    parser.add_argument("--save-dir", required=True)
	    parser.add_argument("--num-samples", type=int, default=256)
	    parser.add_argument("--max-seq-length", type=int, default=512)
	    args = parser.parse_args()

	    config = MODEL_CONFIGS.get(args.model)
	    if config is None:
	        raise ValueError(
	            f"Unknown model: {args.model}. "
	            f"Known models: {list(MODEL_CONFIGS.keys())}"
	        )

	    model = AutoModelForCausalLM.from_pretrained(args.model, dtype="auto")
	    tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)

	    # Slice first, then shuffle with a fixed seed.
	    ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{args.num_samples}]")
	    ds = ds.shuffle(seed=42)

	    def preprocess(example):
	        # Render the conversation with the chat template, as a string.
	        return {
	            "text": tokenizer.apply_chat_template(
	                example["messages"],
	                tokenize=False,
	            )
	        }

	    ds = ds.map(preprocess)

	    def tokenize(sample):
	        # Truncate to --max-seq-length; no padding, no extra special tokens.
	        return tokenizer(
	            sample["text"],
	            padding=False,
	            max_length=args.max_seq_length,
	            truncation=True,
	            add_special_tokens=False,
	        )

	    ds = ds.map(tokenize, remove_columns=ds.column_names)

	    recipe = build_recipe(
	        args.technique, args.scheme, config["ignore"], config["is_moe"]
	    )

	    # Reset the peak-memory counter and start the timer before oneshot runs.
	    torch.cuda.reset_peak_memory_stats()
	    start_time = time.time()

	    oneshot_kwargs = dict(
	        model=model,
	        dataset=ds,
	        recipe=recipe,
	        max_seq_length=args.max_seq_length,
	        num_calibration_samples=args.num_samples,
	    )

	    oneshot(**oneshot_kwargs)

	    elapsed_time = time.time() - start_time
	    peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3)
	    print("Quantization Complete")
	    print(f"Technique: {args.technique}, Scheme: {args.scheme}")
	    print(f"Time: {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")
	    print(f"Peak GPU Memory: {peak_memory_gb:.2f} GB")

	    model.save_pretrained(args.save_dir, save_compressed=True)
	    tokenizer.save_pretrained(args.save_dir)
	    print(f"Model saved to {args.save_dir}")


	# Run the CLI only when this file is executed as a script, not when imported.
	if __name__ == "__main__":
	    main()