Skip to content

Instantly share code, notes, and snippets.

@HDCharles
Last active May 5, 2026 17:17
Show Gist options
  • Select an option

  • Save HDCharles/a8ece53d76ce89ce81ddeed4fba3aa28 to your computer and use it in GitHub Desktop.

Select an option

Save HDCharles/a8ece53d76ce89ce81ddeed4fba3aa28 to your computer and use it in GitHub Desktop.
Updated parallel regression test with chg GPU detection and FP8 support
#!/bin/bash
# Parallel Regression Test Script
# 1. Quantizes remaining models (if needed)
# 2. Runs evaluations in parallel (up to MAX_PARALLEL_JOBS at a time, 4 by default)
# 3. Saves individual logs for each eval job
# 4. Prints summaries as jobs complete
#
# Requirements visible in this script: bash 4+ (associative arrays via
# `declare -A`) and the `chg` command, whose `status` output is parsed to find
# the GPUs reserved by the current user.
#
# Make a pipeline report failure when any of its stages fails, not only the last.
set -o pipefail
# ── Configuration ────────────────────────────────────────────────────────────
# Repository root: the parent of the directory that contains this script.
REPO_DIR="$(cd "$(dirname "$0")/.." && pwd)"
# Dataset cache location, exported so that child processes inherit it.
export HF_DATASETS_CACHE="$HOME/hf_hub"
mkdir -p "$HF_DATASETS_CACHE"
# Quantized checkpoints are written to / looked up under this directory.
MODEL_BASE_DIR="$HOME/hf_hub/regression_models"
# Evaluation outputs, per-job logs and the results CSV are relative to the
# directory the script is started from.
EVAL_BASE_DIR="./eval_results"
EVAL_LOGS_DIR="./eval_logs"
RESULTS_CSV="parallel_regression_results.csv"
# Models to test.
#   key   = model id handed to quantize.py via --model
#   value = "short_name,max_model_len,tp_size" (split on ',' by the job builders)
declare -A MODELS=(
["Qwen/Qwen2.5-3B-Instruct"]="Qwen2.5-3B-Instruct,2048,1"
["meta-llama/Meta-Llama-3-8B-Instruct"]="Meta-Llama-3-8B-Instruct,2048,1"
["Qwen/Qwen3-30B-A3B"]="Qwen3-30B-A3B,2048,2"
)
# Every model is quantized for each technique x branch x scheme combination.
TECHNIQUES=("awq_rtn" "rtn" "rtn_mse" "gptq" "imatrix")
BRANCHES=("main" "90_refactor_obs")
SCHEMES=("NVFP4" "FP8")
# Parallel arrays, read together by index: display name, lm_eval task name and
# few-shot count of each evaluation.
EVAL_TASKS=("wikitext" "mmlu")
EVAL_LM_TASKS=("wikitext" "mmlu")
EVAL_FEWSHOT=("0" "5")
# Parallel config (both quantization and evaluation)
MAX_PARALLEL_JOBS=4
mkdir -p "$EVAL_BASE_DIR" "$MODEL_BASE_DIR" "$EVAL_LOGS_DIR"
# Detect GPUs reserved by current user via chg
# Outputs: the first column of every matching `chg status` row, one per line
#          (a single empty line when nothing matches).
detect_reserved_gpus() {
  local me
  local gpu_ids
  me=$(whoami)
  # Pipeline stages:
  #   1. sed  - remove ANSI colour escape sequences
  #   2. awk  - split rows on the box-drawing bar, skip the first two lines, and
  #             keep rows whose 2nd column contains IN_USE and whose 3rd column
  #             matches the user name; print the trimmed 1st column
  gpu_ids=$(
    chg status 2>/dev/null \
      | sed 's/\x1b\[[0-9;]*m//g' \
      | awk -F '│' -v user="$me" 'NR > 2 && $3 ~ user && $2 ~ /IN_USE/ {
gsub(/^[ \t]+|[ \t]+$/, "", $1);
print $1
}'
  )
  echo "$gpu_ids"
}
echo "Detecting reserved GPUs via chg status..."
# Word splitting of the helper's output is deliberate here: every
# whitespace-separated GPU id becomes one array element.
AVAILABLE_GPUS=($(detect_reserved_gpus))
if (( ${#AVAILABLE_GPUS[@]} == 0 )); then
  echo "ERROR: No GPUs reserved. Please reserve GPUs using 'chg reserve <gpu_ids>' first."
  echo "Example: chg reserve 0,1,2,3"
  exit 1
fi
echo "Reserved GPUs detected: ${AVAILABLE_GPUS[*]}"
echo "Will use up to $MAX_PARALLEL_JOBS parallel jobs"
echo ""
# GPU allocation tracking: GPU id -> 0 (free) / 1 (claimed by this script)
declare -A GPU_IN_USE
for gpu in "${AVAILABLE_GPUS[@]}"; do
  GPU_IN_USE["$gpu"]=0
done
# Helper: get next available GPU from reserved pool
# Globals:
#   AVAILABLE_GPUS (read)    - ids of the reserved GPUs
#   GPU_IN_USE     (updated) - the chosen GPU is marked 1
#   ALLOCATED_GPU  (written) - the chosen GPU id, or empty string if none is free
# Outputs: [DEBUG] status lines on stdout
# Returns: 0 when a GPU was claimed, 1 when every GPU is already claimed
get_free_gpu() {
  # The loop variable is local so that the caller's global `gpu` is not
  # overwritten as a side effect of scanning the pool.
  local gpu
  local internal_status=""
  ALLOCATED_GPU=""
  # Show internal tracking status
  for gpu in "${AVAILABLE_GPUS[@]}"; do
    internal_status+="GPU$gpu:${GPU_IN_USE[$gpu]} "
  done
  echo "[DEBUG] Reserved GPU tracking: $internal_status"
  # Find a GPU that's not currently allocated by us
  for gpu in "${AVAILABLE_GPUS[@]}"; do
    if [ "${GPU_IN_USE[$gpu]}" -eq 0 ]; then
      # GPU is available, claim it
      GPU_IN_USE[$gpu]=1
      ALLOCATED_GPU=$gpu
      echo "[DEBUG] Allocated GPU $gpu"
      return 0
    fi
  done
  # No GPU available
  echo "[DEBUG] No free GPUs available (all currently allocated by script)"
  return 1
}
# Helper: release GPU
# Arguments: $1 - GPU id to mark as free (0) in GPU_IN_USE
release_gpu() {
  local freed_gpu="$1"
  GPU_IN_USE["$freed_gpu"]=0
}
# ── Helper: activate environments ────────────────────────────────────────────
# Sources the virtualenv used for quantization into the current shell.
activate_quant_env() {
  . /home/HDCharles/rhdev/bin/activate
}
# Sources the virtualenv used for evaluation into the current shell.
activate_eval_env() {
  . /home/HDCharles/vllm/bin/activate
}
# ── Helper: checkout branch and reinstall ────────────────────────────────────
# Checks out a branch of the repository and reinstalls it in editable mode.
# Globals:   REPO_DIR (read)
# Arguments: $1 - branch name
# Outputs:   progress lines plus the tail of the git / pip output on stdout
# Returns:   0 on success, 1 when either `git checkout` or `pip install` fails
switch_branch() {
  local branch=$1
  local rc
  echo " Switching to branch: $branch"
  git -C "$REPO_DIR" checkout "$branch" 2>&1 | tail -5
  # Read git's own status from PIPESTATUS: `$?` would be tail's status unless
  # the caller happens to have `set -o pipefail` enabled.
  rc=${PIPESTATUS[0]}
  if [ "$rc" -ne 0 ]; then
    echo " ERROR: git checkout $branch failed"
    return 1
  fi
  activate_quant_env
  pip install -e "$REPO_DIR" 2>&1 | tail -1
  rc=${PIPESTATUS[0]}
  if [ "$rc" -ne 0 ]; then
    # Do not claim a successful install when pip reported an error.
    echo " ERROR: pip install failed for branch $branch"
    return 1
  fi
  echo " Installed llm-compressor from branch $branch"
}
# ── Helper: quantize a model ─────────────────────────────────────────────────
# Runs testing/quantize.py in the foreground unless the output already exists.
# Globals:   REPO_DIR (read)
# Arguments: $1 model id, $2 short name, $3 technique, $4 scheme,
#            $5 branch (used in messages only), $6 output directory
# Returns:   0 when the model exists or was produced, 1 when quantize.py failed
quantize_model() {
  local model="$1" model_short="$2" technique="$3"
  local scheme="$4" branch="$5" save_dir="$6"

  echo ""
  echo "╔══════════════════════════════════════════════════════════════════════════════╗"
  echo "║ QUANTIZING: $model_short / $technique / $scheme / $branch"
  echo "╚══════════════════════════════════════════════════════════════════════════════╝"
  echo ""

  # A directory containing config.json counts as an already finished run.
  if [[ -d "$save_dir" && -f "$save_dir/config.json" ]]; then
    echo "Quantized model already exists at $save_dir, skipping."
    return 0
  fi

  activate_quant_env
  if ! python "$REPO_DIR/testing/quantize.py" \
    --model "$model" \
    --technique "$technique" \
    --scheme "$scheme" \
    --save-dir "$save_dir" 2>&1; then
    echo "QUANTIZATION FAILED for $model_short / $scheme / $technique / $branch"
    return 1
  fi

  echo "Model saved to $save_dir"
  return 0
}
# ── Helper: run single evaluation (called in background) ────────────────────
# Runs one lm_eval job and writes everything it prints to a log file.
# Arguments:
#   $1 model short name   $2 scheme        $3 technique   $4 branch
#   $5 task display name  $6 lm_eval task  $7 few-shot count
#   $8 quantized model directory  $9 lm_eval output directory  $10 log file
# Returns: 0 when lm_eval succeeded, 1 otherwise. Only the NVFP4 scheme is
#          evaluated here (HF backend); any other scheme is reported as FAILED
#          without running anything.
run_single_eval() {
  local model_short=$1
  local scheme=$2
  local technique=$3
  local branch=$4
  local task_name=$5
  local lm_task=$6
  local fewshot=$7
  local save_dir=$8
  local eval_dir=$9
  local log_file="${10}"
  # A subshell is used instead of a `{ ...; }` group: the `exit` calls below
  # must end only the logged section. In a brace group they would terminate
  # the calling shell whenever this function is run in the foreground, and
  # `return $?` would never be reached.
  (
    echo "════════════════════════════════════════════════════════════════"
    echo "EVAL START: $model_short / $technique / $branch / $task_name"
    echo "Task: $lm_task, Fewshot: $fewshot"
    echo "════════════════════════════════════════════════════════════════"
    echo ""
    mkdir -p "$eval_dir"
    activate_eval_env
    local result="FAILED"
    local backend="FAILED"
    # Try HF backend for NVFP4
    if [ "$scheme" == "NVFP4" ]; then
      echo "Using HF backend for NVFP4..."
      # Optional flags are kept in an array so no unquoted expansion is needed.
      local -a chat_args=(--apply_chat_template)
      if [ "$fewshot" -gt 0 ]; then
        chat_args+=(--fewshot_as_multiturn)
      fi
      if lm_eval \
        --model hf \
        --model_args "pretrained=$save_dir,dtype=auto,add_bos_token=True" \
        --tasks "$lm_task" \
        --num_fewshot "$fewshot" \
        --batch_size auto \
        "${chat_args[@]}" \
        --output_path "$eval_dir" 2>&1; then
        result="PASSED"
        backend="hf"
      fi
    fi
    echo ""
    echo "════════════════════════════════════════════════════════════════"
    echo "EVAL COMPLETE: $result"
    echo "Backend: $backend"
    echo "════════════════════════════════════════════════════════════════"
    # Return status via exit code
    if [ "$result" == "PASSED" ]; then
      exit 0
    else
      exit 1
    fi
  ) &> "$log_file"
  return $?
}
# ── Helper: extract metric from eval results ────────────────────────────────
# Prints the headline metric of one task from the newest lm_eval results file.
# Arguments:
#   $1 - directory searched (recursively) for results_*.json
#   $2 - task name; the first key of the "results" object containing it is used
# Outputs:
#   wikitext -> word perplexity with 2 decimals, mmlu / gsm8k -> percentage,
#   any other task -> first numeric non-stderr value with 4 decimals,
#   "N/A" when no file, task or value is found or the parser fails.
extract_metric() {
  local eval_output_dir=$1
  local task=$2
  local results_json
  # Last file in lexicographic path order
  results_json=$(find "$eval_output_dir" -name "results_*.json" -type f 2>/dev/null | sort | tail -1)
  if [ -z "$results_json" ]; then
    echo "N/A"
    return
  fi
  # Path and task are passed as arguments rather than pasted into the Python
  # source, so quotes or backslashes in either cannot break or alter the code.
  python3 - "$results_json" "$task" 2>/dev/null <<'PYEOF' || echo "N/A"
import json, sys
with open(sys.argv[1]) as f:
    data = json.load(f)
results = data.get('results', {})
task = sys.argv[2]
task_results = None
for key in results:
    if task in key:
        task_results = results[key]
        break
if task_results is None:
    print('N/A')
    sys.exit()
if 'gsm8k' in task:
    val = task_results.get('exact_match,strict-match')
    if val is not None:
        print(f'{val*100:.2f}%')
    else:
        print('N/A')
elif 'wikitext' in task:
    val = task_results.get('word_perplexity,none')
    if val is not None:
        print(f'{val:.2f}')
    else:
        print('N/A')
elif 'mmlu' in task:
    val = task_results.get('acc,none')
    if val is not None:
        print(f'{val*100:.2f}%')
    else:
        print('N/A')
else:
    for k, v in task_results.items():
        if 'stderr' not in k and k != 'alias' and isinstance(v, (int, float)):
            print(f'{v:.4f}')
            sys.exit()
    print('N/A')
PYEOF
}
# ── Helper: print comparison summary ─────────────────────────────────────────
# Prints a main-vs-PR-branch table built from the PASSED and CACHED rows of the
# results CSV.
# Globals: RESULTS_CSV (read)
# Outputs: the table on stdout; nothing when the CSV is missing or when no
#          (model, scheme, technique, task) has results for both `main` and
#          another branch.
print_comparison() {
  if [ ! -f "$RESULTS_CSV" ]; then
    return
  fi
  python3 - "$RESULTS_CSV" <<'PYEOF'
import csv, sys
csv_path = sys.argv[1]
rows = []
with open(csv_path) as f:
    reader = csv.DictReader(f)
    for r in reader:
        if r.get('status') in ['PASSED', 'CACHED']:
            rows.append(r)
if not rows:
    sys.exit()
# Build lookup: (model, scheme, technique, task) -> {branch: metric}
lookup = {}
for r in rows:
    key = (r["model"], r["scheme"], r["technique"], r["task"])
    lookup.setdefault(key, {})
    lookup[key][r["branch"]] = r["metric"]
entries = [(k, v) for k, v in lookup.items()
           if "main" in v and any(b != "main" for b in v)]
if not entries:
    sys.exit()
# Take the PR branch name from the comparable entries. The first lookup row
# may only hold a "main" result (e.g. when the PR run failed), which would
# otherwise leave the whole PR column empty.
pr_branches = sorted({b for _, v in entries for b in v if b != "main"})
pr_branch = pr_branches[0] if pr_branches else "pr"
def parse_metric(s):
    s = s.strip()
    if s.endswith("%"):
        try:
            return float(s[:-1]), True
        except ValueError:
            return None, False
    try:
        return float(s), False
    except ValueError:
        return None, False
def calc_change(main_str, pr_str, task):
    m_val, _ = parse_metric(main_str)
    p_val, _ = parse_metric(pr_str)
    if m_val is None or p_val is None or m_val == 0:
        return "N/A"
    # For wikitext the sign is flipped: a drop in the metric is shown as "+".
    if "wikitext" in task:
        pct = (m_val - p_val) / m_val * 100
    else:
        pct = (p_val - m_val) / m_val * 100
    sign = "+" if pct >= 0 else ""
    return f"{sign}{pct:.2f}%"
print("")
print("╔════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╗")
print(f"║ BRANCH COMPARISON (main vs {pr_branch})")
print("╚════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╝")
print("")
header = (f"{'model':<30} {'scheme':<10} {'technique':<12} {'task':<18} "
          f"{'main':>14} {'PR':>14} {'change':>12}")
print(header)
print("-" * len(header))
for (model, scheme, technique, task), metrics in sorted(entries):
    m = metrics.get("main", "")
    p = metrics.get(pr_branch, "")
    change = calc_change(m, p, task) if m and p else ""
    print(f"{model:<30} {scheme:<10} {technique:<12} {task:<18} "
          f"{m:>14} {p:>14} {change:>12}")
print("")
PYEOF
}
# ── Helper: run quantization in background ──────────────────────────────────
# Runs testing/quantize.py once and writes everything it prints to a log file.
# Globals:   REPO_DIR (read)
# Arguments: $1 model id, $2 short name, $3 technique, $4 scheme,
#            $5 branch (used in the log banner only), $6 output directory,
#            $7 log file
# Returns:   0 when quantize.py succeeded, 1 otherwise
run_quantize_job() {
  local model=$1
  local model_short=$2
  local technique=$3
  local scheme=$4
  local branch=$5
  local save_dir=$6
  local log_file=$7
  # A subshell is used instead of a `{ ...; }` group: the `exit` calls below
  # must end only the logged section. In a brace group they would terminate
  # the calling shell whenever this function is run in the foreground, and
  # `return $?` would never be reached.
  (
    echo "════════════════════════════════════════════════════════════════"
    echo "QUANT START: $model_short / $technique / $branch"
    echo "════════════════════════════════════════════════════════════════"
    echo ""
    activate_quant_env
    if python "$REPO_DIR/testing/quantize.py" \
      --model "$model" \
      --technique "$technique" \
      --scheme "$scheme" \
      --save-dir "$save_dir" 2>&1; then
      echo ""
      echo "════════════════════════════════════════════════════════════════"
      echo "QUANT COMPLETE: SUCCESS"
      echo "Model saved to $save_dir"
      echo "════════════════════════════════════════════════════════════════"
      exit 0
    else
      echo ""
      echo "════════════════════════════════════════════════════════════════"
      echo "QUANT COMPLETE: FAILED"
      echo "════════════════════════════════════════════════════════════════"
      exit 1
    fi
  ) &> "$log_file"
  return $?
}
# ── Step 1: Parallel Quantization ───────────────────────────────────────────
# This part only plans the work: it fills QUANT_JOBS with one entry per
# model/scheme/technique/branch combination that has no finished checkpoint
# yet, orders the entries by branch and prints the plan.
echo ""
echo "╔══════════════════════════════════════════════════════════════════════════════╗"
echo "║ STEP 1: PARALLEL QUANTIZATION (${MAX_PARALLEL_JOBS} jobs at a time) ║"
echo "╚══════════════════════════════════════════════════════════════════════════════╝"
echo ""
# Build list of quantization jobs needed
# Entry format: "model_key|model_short|scheme|technique|branch|save_dir"
declare -a QUANT_JOBS
TOTAL_SKIPPED=0
for model_key in "${!MODELS[@]}"; do
# max_len and tp_size are parsed here but not used in the quantization step
IFS=',' read -r model_short max_len tp_size <<< "${MODELS[$model_key]}"
for scheme in "${SCHEMES[@]}"; do
for technique in "${TECHNIQUES[@]}"; do
for branch in "${BRANCHES[@]}"; do
save_dir="$MODEL_BASE_DIR/${model_short}-${scheme}-${technique}-${branch}"
# A directory that already contains config.json counts as finished
if [ -d "$save_dir" ] && [ -f "$save_dir/config.json" ]; then
TOTAL_SKIPPED=$((TOTAL_SKIPPED + 1))
echo " SKIP: $model_short / $scheme / $technique / $branch (already exists)"
continue
fi
# Add to quantization queue
QUANT_JOBS+=("$model_key|$model_short|$scheme|$technique|$branch|$save_dir")
done
done
done
done
# Sort jobs by branch to batch them efficiently: the scheduler waits for all
# running jobs before it switches branch, so all "main" jobs are queued first.
if [ ${#QUANT_JOBS[@]} -gt 0 ]; then
# Separate into main and other branches
declare -a MAIN_JOBS
declare -a OTHER_JOBS
for job_info in "${QUANT_JOBS[@]}"; do
# The branch is the 5th '|'-separated field of a job entry
IFS='|' read -r _ _ _ _ branch _ <<< "$job_info"
if [ "$branch" == "main" ]; then
MAIN_JOBS+=("$job_info")
else
OTHER_JOBS+=("$job_info")
fi
done
# Rebuild QUANT_JOBS with main first, then others
QUANT_JOBS=("${MAIN_JOBS[@]}" "${OTHER_JOBS[@]}")
fi
echo ""
echo "Already quantized: $TOTAL_SKIPPED models"
echo "Quantization jobs to run: ${#QUANT_JOBS[@]}"
echo ""
# List the planned jobs in the order in which they will be started
if [ ${#QUANT_JOBS[@]} -gt 0 ]; then
echo "Models to be quantized:"
echo "────────────────────────────────────────────────────────────────"
for job_info in "${QUANT_JOBS[@]}"; do
IFS='|' read -r _ model_short scheme technique branch _ <<< "$job_info"
echo " • $model_short / $scheme / $technique / $branch"
done
echo "────────────────────────────────────────────────────────────────"
echo ""
fi
# Run quantization jobs in parallel
# The four arrays below are parallel: index i describes one running job
# (its PID, log file, "model_short|scheme|technique|branch|save_dir" info
# string and the GPU it was given).
declare -a QUANT_PIDS
declare -a QUANT_LOGS
declare -a QUANT_INFO
declare -a QUANT_GPUS
quant_idx=0
quant_completed=0
quant_failed=0
# Switch to first branch for initial setup
# NOTE(review): the return status of switch_branch is not checked here or in
# the loop below, so jobs are started even if the checkout failed - confirm
# whether that is intended.
if [ ${#QUANT_JOBS[@]} -gt 0 ]; then
IFS='|' read -r _ _ _ _ first_branch _ <<< "${QUANT_JOBS[0]}"
switch_branch "$first_branch"
fi
# Scheduler loop: runs until every job has been started and has finished.
# Each pass starts as many jobs as capacity allows, reaps finished jobs and
# then sleeps for 5 seconds.
while [ $quant_idx -lt ${#QUANT_JOBS[@]} ] || [ ${#QUANT_PIDS[@]} -gt 0 ]; do
# Start new jobs if we have capacity and a GPU is available
while [ ${#QUANT_PIDS[@]} -lt $MAX_PARALLEL_JOBS ] && [ $quant_idx -lt ${#QUANT_JOBS[@]} ]; do
# Try to get a free GPU
get_free_gpu
if [ -z "$ALLOCATED_GPU" ]; then
break # No GPUs available, wait
fi
gpu=$ALLOCATED_GPU
# Parse job info
job_info="${QUANT_JOBS[$quant_idx]}"
IFS='|' read -r model_key model_short scheme technique branch save_dir <<< "$job_info"
# Check if we need to switch branches
current_branch=$(git -C "$REPO_DIR" rev-parse --abbrev-ref HEAD 2>/dev/null)
if [ "$current_branch" != "$branch" ]; then
# Wait for all running jobs to finish before switching
if [ ${#QUANT_PIDS[@]} -gt 0 ]; then
# Give back the GPU claimed above; this job is retried on a later pass
release_gpu "$gpu"
break
fi
switch_branch "$branch"
fi
# Create log file
timestamp=$(date +%Y%m%d-%H%M%S)
log_file="$EVAL_LOGS_DIR/${timestamp}_QUANT_${model_short}_${scheme}_${technique}_${branch}.log"
echo "Starting quant job $((quant_idx + 1))/${#QUANT_JOBS[@]} on GPU $gpu: $model_short / $scheme / $technique / $branch"
echo " Log: $log_file"
# Start background job with specific GPU
# The job values are expanded into the command string by this shell before
# the child bash starts; stdout and stderr of the child go to the log file.
CUDA_VISIBLE_DEVICES=$gpu bash -c "
source /home/HDCharles/rhdev/bin/activate
python '$REPO_DIR/testing/quantize.py' \
--model '$model_key' \
--technique '$technique' \
--scheme '$scheme' \
--save-dir '$save_dir' 2>&1
" &> "$log_file" &
pid=$!
QUANT_PIDS+=($pid)
QUANT_LOGS+=("$log_file")
QUANT_INFO+=("$model_short|$scheme|$technique|$branch|$save_dir")
QUANT_GPUS+=($gpu)
quant_idx=$((quant_idx + 1))
done
# Check for completed jobs
# Jobs that are still alive are copied into the new_* arrays; finished jobs
# are reaped, counted and dropped.
new_pids=()
new_logs=()
new_info=()
new_gpus=()
for i in "${!QUANT_PIDS[@]}"; do
pid="${QUANT_PIDS[$i]}"
# kill -0 sends no signal; it only tests whether the process still exists
if kill -0 "$pid" 2>/dev/null; then
# Still running, keep it
new_pids+=("$pid")
new_logs+=("${QUANT_LOGS[$i]}")
new_info+=("${QUANT_INFO[$i]}")
new_gpus+=("${QUANT_GPUS[$i]}")
else
# Job finished
# wait returns the exit status of the finished background job
wait "$pid" 2>/dev/null
exit_code=$?
log_file="${QUANT_LOGS[$i]}"
gpu="${QUANT_GPUS[$i]}"
IFS='|' read -r model_short scheme technique branch save_dir <<< "${QUANT_INFO[$i]}"
# Release GPU
release_gpu "$gpu"
if [ $exit_code -eq 0 ]; then
echo ""
echo "✓ QUANT COMPLETED: $model_short / $scheme / $technique / $branch (GPU $gpu freed)"
echo " Saved to: $save_dir"
echo " Log: $log_file"
echo ""
quant_completed=$((quant_completed + 1))
else
echo ""
echo "✗ QUANT FAILED: $model_short / $scheme / $technique / $branch (GPU $gpu freed)"
echo " Log: $log_file"
echo ""
quant_failed=$((quant_failed + 1))
fi
fi
done
# Update arrays
QUANT_PIDS=("${new_pids[@]}")
QUANT_LOGS=("${new_logs[@]}")
QUANT_INFO=("${new_info[@]}")
QUANT_GPUS=("${new_gpus[@]}")
# Polling interval between scheduler passes
sleep 5
done
echo ""
echo "Quantization phase complete:"
echo " Completed: $quant_completed"
echo " Failed: $quant_failed"
echo " Skipped: $TOTAL_SKIPPED (already existed)"
echo ""
# ── Step 2: Parallel Evaluation ─────────────────────────────────────────────
# This part only plans the work: it starts a fresh results CSV, records
# evaluations whose results already exist as CACHED rows and queues the rest
# in EVAL_JOBS.
echo ""
echo "╔══════════════════════════════════════════════════════════════════════════════╗"
echo "║ STEP 2: PARALLEL EVALUATION (${MAX_PARALLEL_JOBS} jobs at a time) ║"
echo "╚══════════════════════════════════════════════════════════════════════════════╝"
echo ""
# Initialize results CSV (an existing file is overwritten)
echo "model,scheme,technique,branch,task,metric,status,backend,save_dir" > "$RESULTS_CSV"
# Build list of all eval jobs
# Entry format:
#   "save_dir|model_short|scheme|technique|branch|task_name|lm_task|fewshot|eval_dir|max_len|tp_size"
declare -a EVAL_JOBS
# Process models in order: smallest to largest (3B, 8B, 30B)
# (an explicit list is used; the key order of the MODELS map is not relied on)
MODEL_ORDER=("Qwen/Qwen2.5-3B-Instruct" "meta-llama/Meta-Llama-3-8B-Instruct" "Qwen/Qwen3-30B-A3B")
for model_key in "${MODEL_ORDER[@]}"; do
# Skip if model not in MODELS array
if [ -z "${MODELS[$model_key]}" ]; then
continue
fi
IFS=',' read -r model_short max_len tp_size <<< "${MODELS[$model_key]}"
for scheme in "${SCHEMES[@]}"; do
for technique in "${TECHNIQUES[@]}"; do
for branch in "${BRANCHES[@]}"; do
save_dir="$MODEL_BASE_DIR/${model_short}-${scheme}-${technique}-${branch}"
# Skip if model doesn't exist (never quantized, or quantization failed)
if [ ! -d "$save_dir" ] || [ ! -f "$save_dir/config.json" ]; then
continue
fi
# EVAL_TASKS, EVAL_LM_TASKS and EVAL_FEWSHOT are read together by index
for eval_idx in "${!EVAL_TASKS[@]}"; do
task_name="${EVAL_TASKS[$eval_idx]}"
lm_task="${EVAL_LM_TASKS[$eval_idx]}"
fewshot="${EVAL_FEWSHOT[$eval_idx]}"
eval_dir="$EVAL_BASE_DIR/${model_short}-${scheme}-${technique}-${branch}/${task_name}"
# Skip if results already exist: any results_*.json below eval_dir is
# reused and written to the CSV with status CACHED
if find "$eval_dir" -name "results_*.json" -type f 2>/dev/null | grep -q .; then
metric_val=$(extract_metric "$eval_dir" "$lm_task")
echo "$model_short,$scheme,$technique,$branch,$task_name,$metric_val,CACHED,cached,$save_dir" >> "$RESULTS_CSV"
echo " CACHED: $model_short / $scheme / $technique / $branch / $task_name = $metric_val"
continue
fi
# Add to job queue (max_len and tp_size are carried along for the vLLM backend)
EVAL_JOBS+=("$save_dir|$model_short|$scheme|$technique|$branch|$task_name|$lm_task|$fewshot|$eval_dir|$max_len|$tp_size")
done
done
done
done
done
echo ""
echo "Total evaluation jobs to run: ${#EVAL_JOBS[@]}"
echo ""
# Run evaluations in parallel
# The four arrays below are parallel: index i describes one running job
# (its PID, log file, '|'-separated info string and the GPU it was given).
declare -a RUNNING_PIDS
declare -a RUNNING_LOGS
declare -a RUNNING_INFO
declare -a RUNNING_GPUS
job_idx=0
completed_count=0
failed_count=0
# Scheduler loop: runs until every job has been started and has finished.
# Each pass starts as many jobs as capacity allows, reaps finished jobs,
# appends their result row to the CSV and then sleeps for 5 seconds.
while [ $job_idx -lt ${#EVAL_JOBS[@]} ] || [ ${#RUNNING_PIDS[@]} -gt 0 ]; do
# Start new jobs if we have capacity and a GPU is available
while [ ${#RUNNING_PIDS[@]} -lt $MAX_PARALLEL_JOBS ] && [ $job_idx -lt ${#EVAL_JOBS[@]} ]; do
# Try to get a free GPU
get_free_gpu
if [ -z "$ALLOCATED_GPU" ]; then
break # No GPUs available, wait
fi
gpu=$ALLOCATED_GPU
# Parse job info
# NOTE(review): tp_size is parsed but not referenced by either command
# below; every job is given exactly one GPU.
job_info="${EVAL_JOBS[$job_idx]}"
IFS='|' read -r save_dir model_short scheme technique branch task_name lm_task fewshot eval_dir max_len tp_size <<< "$job_info"
# Create log file
timestamp=$(date +%Y%m%d-%H%M%S)
log_file="$EVAL_LOGS_DIR/${timestamp}_${model_short}_${scheme}_${technique}_${branch}_${task_name}.log"
echo "Starting job $((job_idx + 1))/${#EVAL_JOBS[@]} on GPU $gpu: $model_short / $scheme / $technique / $branch / $task_name"
echo " Log: $log_file"
# Choose backend based on scheme
# In both branches the job values are expanded into the command string by
# this shell, while the \$-escaped names are expanded by the child bash.
if [ "$scheme" == "FP8" ]; then
# Use vLLM for FP8 (always TP=1 for single-GPU async handling)
backend="vllm"
CUDA_VISIBLE_DEVICES=$gpu bash -c "
source /home/HDCharles/vllm/bin/activate
mkdir -p '$eval_dir'
chat_args='--apply_chat_template'
if [ '$fewshot' -gt 0 ]; then
chat_args=\"\$chat_args --fewshot_as_multiturn\"
fi
lm_eval \
--model vllm \
--model_args 'pretrained=$save_dir,dtype=auto,max_model_len=$max_len,add_bos_token=True,gpu_memory_utilization=0.85' \
--tasks '$lm_task' \
--num_fewshot '$fewshot' \
--batch_size auto \
\$chat_args \
--output_path '$eval_dir' 2>&1
" &> "$log_file" &
else
# Use HF for NVFP4 (and for any scheme other than FP8)
backend="hf"
CUDA_VISIBLE_DEVICES=$gpu bash -c "
source /home/HDCharles/vllm/bin/activate
mkdir -p '$eval_dir'
chat_args='--apply_chat_template'
if [ '$fewshot' -gt 0 ]; then
chat_args=\"\$chat_args --fewshot_as_multiturn\"
fi
lm_eval \
--model hf \
--model_args 'pretrained=$save_dir,dtype=auto,add_bos_token=True' \
--tasks '$lm_task' \
--num_fewshot '$fewshot' \
--batch_size auto \
\$chat_args \
--output_path '$eval_dir' 2>&1
" &> "$log_file" &
fi
# $! is the PID of the background job started in whichever branch ran
pid=$!
RUNNING_PIDS+=($pid)
RUNNING_LOGS+=("$log_file")
RUNNING_INFO+=("$model_short|$scheme|$technique|$branch|$task_name|$lm_task|$save_dir|$eval_dir|$backend")
RUNNING_GPUS+=($gpu)
job_idx=$((job_idx + 1))
done
# Check for completed jobs
# Jobs that are still alive are copied into the new_* arrays; finished jobs
# are reaped, recorded in the CSV and dropped.
new_pids=()
new_logs=()
new_info=()
new_gpus=()
for i in "${!RUNNING_PIDS[@]}"; do
pid="${RUNNING_PIDS[$i]}"
# kill -0 sends no signal; it only tests whether the process still exists
if kill -0 "$pid" 2>/dev/null; then
# Still running, keep it
new_pids+=("$pid")
new_logs+=("${RUNNING_LOGS[$i]}")
new_info+=("${RUNNING_INFO[$i]}")
new_gpus+=("${RUNNING_GPUS[$i]}")
else
# Job finished
# wait returns the exit status of the finished background job
wait "$pid" 2>/dev/null
exit_code=$?
log_file="${RUNNING_LOGS[$i]}"
gpu="${RUNNING_GPUS[$i]}"
IFS='|' read -r model_short scheme technique branch task_name lm_task save_dir eval_dir backend <<< "${RUNNING_INFO[$i]}"
# Release GPU
release_gpu "$gpu"
if [ $exit_code -eq 0 ]; then
# The metric is read back from the results file written by lm_eval
metric_val=$(extract_metric "$eval_dir" "$lm_task")
echo "$model_short,$scheme,$technique,$branch,$task_name,$metric_val,PASSED,$backend,$save_dir" >> "$RESULTS_CSV"
echo ""
echo "✓ COMPLETED: $model_short / $scheme / $technique / $branch / $task_name (GPU $gpu freed)"
echo " Metric: $metric_val"
echo " Log: $log_file"
echo ""
completed_count=$((completed_count + 1))
else
echo "$model_short,$scheme,$technique,$branch,$task_name,N/A,FAILED,$backend,$save_dir" >> "$RESULTS_CSV"
echo ""
echo "✗ FAILED: $model_short / $scheme / $technique / $branch / $task_name (GPU $gpu freed)"
echo " Log: $log_file"
echo ""
failed_count=$((failed_count + 1))
fi
fi
done
# Update arrays
RUNNING_PIDS=("${new_pids[@]}")
RUNNING_LOGS=("${new_logs[@]}")
RUNNING_INFO=("${new_info[@]}")
RUNNING_GPUS=("${new_gpus[@]}")
# Polling interval between scheduler passes
sleep 5
done
# ── Final Summary ────────────────────────────────────────────────────────────
# Totals of the evaluation phase, followed by the sorted results table and the
# branch comparison.
printf '%s\n' \
  "" \
  "╔══════════════════════════════════════════════════════════════════════════════╗" \
  "║ FINAL SUMMARY ║" \
  "╚══════════════════════════════════════════════════════════════════════════════╝" \
  "" \
  "Completed: $completed_count" \
  "Failed: $failed_count" \
  "Total: $((completed_count + failed_count))" \
  "" \
  "Results CSV: $RESULTS_CSV" \
  "Eval logs: $EVAL_LOGS_DIR/" \
  ""
if [[ -f "$RESULTS_CSV" ]]; then
  echo "Results (sorted by model, technique, branch, task):"
  # Header line first, then the data rows sorted on columns 1, 3, 4 and 5
  {
    head -1 "$RESULTS_CSV" \
      && tail -n +2 "$RESULTS_CSV" | sort -t',' -k1,1 -k3,3 -k4,4 -k5,5
  } | column -t -s','
fi
print_comparison
#!/bin/bash
# Print current progress of parallel regression test
# Reads parallel_regression_results.csv from the current directory, prints the
# number of rows per status and the sorted results table.
RESULTS_CSV="parallel_regression_results.csv"
if [ ! -f "$RESULTS_CSV" ]; then
  echo "No results file found: $RESULTS_CSV"
  exit 1
fi
# Count results by status
# `grep -c` prints 0 itself when nothing matches, but exits with status 1.
# `|| true` only neutralises that status; `|| echo 0` would append a second
# "0" line to the captured value and break the arithmetic below.
total=$(tail -n +2 "$RESULTS_CSV" | wc -l)
passed=$(grep -c ",PASSED," "$RESULTS_CSV" || true)
cached=$(grep -c ",CACHED," "$RESULTS_CSV" || true)
failed=$(grep -c ",FAILED," "$RESULTS_CSV" || true)
completed=$((passed + cached))
echo ""
echo "╔══════════════════════════════════════════════════════════════════════════════╗"
echo "║ PROGRESS SUMMARY ║"
echo "╚══════════════════════════════════════════════════════════════════════════════╝"
echo ""
echo "Completed: $completed (passed: $passed, cached: $cached)"
echo "Failed: $failed"
echo "Total: $total"
echo ""
# Print results table
if (( total > 0 )); then
  echo "Results (sorted by model, technique, branch, task):"
  echo "────────────────────────────────────────────────────────────────────────────────"
  # Header line first, then the data rows sorted on columns 1, 3, 4 and 5
  (head -1 "$RESULTS_CSV" && tail -n +2 "$RESULTS_CSV" | sort -t',' -k1,1 -k3,3 -k4,4 -k5,5) | column -t -s','
  echo ""
fi
# Print comparison table
# Builds a main-vs-PR-branch table from the PASSED and CACHED rows of the CSV.
# Nothing is printed when no (model, scheme, technique, task) has results for
# both `main` and another branch.
python3 - "$RESULTS_CSV" <<'PYEOF'
import csv, sys
csv_path = sys.argv[1]
rows = []
with open(csv_path) as f:
    reader = csv.DictReader(f)
    for r in reader:
        if r.get('status') in ['PASSED', 'CACHED']:
            rows.append(r)
if not rows:
    sys.exit()
# Build lookup: (model, scheme, technique, task) -> {branch: metric}
lookup = {}
for r in rows:
    key = (r["model"], r["scheme"], r["technique"], r["task"])
    lookup.setdefault(key, {})
    lookup[key][r["branch"]] = r["metric"]
entries = [(k, v) for k, v in lookup.items()
           if "main" in v and any(b != "main" for b in v)]
if not entries:
    sys.exit()
# Take the PR branch name from the comparable entries. The first lookup row
# may only hold a "main" result (e.g. when the PR run failed), which would
# otherwise leave the whole PR column empty.
pr_branches = sorted({b for _, v in entries for b in v if b != "main"})
pr_branch = pr_branches[0] if pr_branches else "pr"
def parse_metric(s):
    s = s.strip()
    if s.endswith("%"):
        try:
            return float(s[:-1]), True
        except ValueError:
            return None, False
    try:
        return float(s), False
    except ValueError:
        return None, False
def calc_change(main_str, pr_str, task):
    m_val, _ = parse_metric(main_str)
    p_val, _ = parse_metric(pr_str)
    if m_val is None or p_val is None or m_val == 0:
        return "N/A"
    # For wikitext the sign is flipped: a drop in the metric is shown as "+".
    if "wikitext" in task:
        pct = (m_val - p_val) / m_val * 100
    else:
        pct = (p_val - m_val) / m_val * 100
    sign = "+" if pct >= 0 else ""
    return f"{sign}{pct:.2f}%"
print("")
print("╔════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╗")
print(f"║ BRANCH COMPARISON (main vs {pr_branch})")
print("╚════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╝")
print("")
header = (f"{'model':<30} {'scheme':<10} {'technique':<12} {'task':<18} "
          f"{'main':>14} {'PR':>14} {'change':>12}")
print(header)
print("-" * len(header))
for (model, scheme, technique, task), metrics in sorted(entries):
    m = metrics.get("main", "")
    p = metrics.get(pr_branch, "")
    change = calc_change(m, p, task) if m and p else ""
    print(f"{model:<30} {scheme:<10} {technique:<12} {task:<18} "
          f"{m:>14} {p:>14} {change:>12}")
print("")
PYEOF
import argparse
import os
import time
import torch
from compressed_tensors.offload import dispatch_model
from compressed_tensors.quantization import preset_name_to_scheme
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
# Monkey-patch os.chmod to ignore permission errors on shared cache
_original_chmod = os.chmod


def _chmod_ignore_errors(path, mode, *args, **kwargs):
    """Call the real ``os.chmod`` and swallow ``PermissionError``.

    Extra positional and keyword arguments (``dir_fd``, ``follow_symlinks``)
    are forwarded unchanged, so callers that use the full ``os.chmod``
    signature keep working after the patch.
    """
    try:
        _original_chmod(path, mode, *args, **kwargs)
    except PermissionError:
        pass  # Silently ignore chmod errors on shared cache files


os.chmod = _chmod_ignore_errors
from llmcompressor import oneshot
from llmcompressor.modifiers.gptq import GPTQModifier
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.modifiers.transform.awq import AWQModifier
from llmcompressor.modifiers.transform.imatrix import IMatrixGatherer
from llmcompressor.modifiers.transform.smoothquant import SmoothQuantModifier
# Per-model quantization settings, keyed by the model id given via --model.
#   ignore - module names / "re:" patterns passed as `ignore` to the modifiers
#   is_moe - flag consumed by build_recipe()
MODEL_CONFIGS = {
    "Qwen/Qwen2.5-3B-Instruct": dict(
        ignore=["lm_head"],
        is_moe=False,
    ),
    "meta-llama/Meta-Llama-3-8B-Instruct": dict(
        ignore=["lm_head"],
        is_moe=False,
    ),
    "Qwen/Qwen3-30B-A3B": dict(
        ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"],
        is_moe=True,
    ),
}

# Calibration data: dataset id and the split that samples are taken from.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
def build_recipe(technique, scheme, ignore, is_moe):
    """Return the list of modifiers that implements one quantization technique.

    Args:
        technique: one of "awq_rtn", "rtn", "rtn_mse", "gptq", "imatrix".
        scheme: preset scheme name handed to the modifiers.
        ignore: module names / patterns the modifiers must skip.
        is_moe: when true, AWQ duo scaling is switched off.

    Raises:
        ValueError: if ``technique`` is not one of the names above.
    """
    if technique == "awq_rtn":
        # duo_scaling only works with per-channel strategies (GROUP, CHANNEL)
        # FP8 uses TENSOR strategy, so disable duo_scaling for it
        duo = False if ("FP8" in scheme or is_moe) else "both"
        awq = AWQModifier(duo_scaling=duo)
        quant = QuantizationModifier(
            ignore=ignore, scheme=scheme, targets=["Linear"]
        )
        return [awq, quant]

    if technique == "rtn":
        quant = QuantizationModifier(
            ignore=ignore, scheme=scheme, targets=["Linear"]
        )
        return [quant]

    if technique == "rtn_mse":
        # Same preset as plain RTN, but with the weight observer replaced
        preset = preset_name_to_scheme(scheme, ["Linear"])
        preset.weights.observer = "memoryless_mse"
        quant = QuantizationModifier(
            config_groups={"group_0": preset},
            ignore=ignore,
        )
        return [quant]

    if technique == "gptq":
        # SmoothQuant is prepended only for schemes whose name contains "W8A8"
        steps = (
            [SmoothQuantModifier(smoothing_strength=0.8)]
            if "W8A8" in scheme
            else []
        )
        steps.append(
            GPTQModifier(ignore=ignore, scheme=scheme, targets=["Linear"])
        )
        return steps

    if technique == "imatrix":
        preset = preset_name_to_scheme(scheme, ["Linear"])
        preset.weights.observer = "imatrix_mse"
        return [
            IMatrixGatherer(ignore=ignore),
            QuantizationModifier(
                config_groups={"group_0": preset},
                ignore=ignore,
            ),
        ]

    raise ValueError(f"Unknown technique: {technique}")
def main():
    """Quantize one model with one technique/scheme and save the result.

    Command line: --model, --technique, --scheme and --save-dir are required;
    --num-samples (default 256) and --max-seq-length (default 512) control the
    calibration data. Timing and peak GPU memory are printed before saving.

    Raises:
        ValueError: if --model is not a key of MODEL_CONFIGS.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--model", required=True)
    cli.add_argument(
        "--technique",
        required=True,
        choices=["awq_rtn", "rtn", "rtn_mse", "gptq", "imatrix"],
    )
    cli.add_argument("--scheme", required=True)
    cli.add_argument("--save-dir", required=True)
    cli.add_argument("--num-samples", type=int, default=256)
    cli.add_argument("--max-seq-length", type=int, default=512)
    args = cli.parse_args()

    config = MODEL_CONFIGS.get(args.model)
    if config is None:
        raise ValueError(
            f"Unknown model: {args.model}. "
            f"Known models: {list(MODEL_CONFIGS.keys())}"
        )

    model = AutoModelForCausalLM.from_pretrained(args.model, dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)

    # Calibration set: the first num_samples rows of the split, then shuffled
    raw = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{args.num_samples}]")
    shuffled = raw.shuffle(seed=42)

    def render_chat(example):
        # Turn the message list into one chat-formatted string
        rendered = tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
        return {"text": rendered}

    with_text = shuffled.map(render_chat)

    def encode(sample):
        # The chat template output is tokenized without adding special tokens
        return tokenizer(
            sample["text"],
            padding=False,
            max_length=args.max_seq_length,
            truncation=True,
            add_special_tokens=False,
        )

    # Keep only the tokenizer output columns
    calibration = with_text.map(encode, remove_columns=with_text.column_names)

    recipe = build_recipe(
        args.technique, args.scheme, config["ignore"], config["is_moe"]
    )

    torch.cuda.reset_peak_memory_stats()
    started = time.time()
    oneshot(
        model=model,
        dataset=calibration,
        recipe=recipe,
        max_seq_length=args.max_seq_length,
        num_calibration_samples=args.num_samples,
    )
    elapsed_time = time.time() - started
    peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3)

    print("Quantization Complete")
    print(f"Technique: {args.technique}, Scheme: {args.scheme}")
    print(f"Time: {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")
    print(f"Peak GPU Memory: {peak_memory_gb:.2f} GB")

    model.save_pretrained(args.save_dir, save_compressed=True)
    tokenizer.save_pretrained(args.save_dir)
    print(f"Model saved to {args.save_dir}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment