LLM Compressor Testing Setup - GPTQ actorder regression tests
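Contents: extract_log_summary.py (parses regression logs into a comparison table), four one-shot GPTQ quantization scripts (Llama-3-8B with FP8_BLOCK, W4A16, and W8A16 schemes, plus Qwen3-VL-8B with FP8_BLOCK), and three driver scripts (run_all_tests.sh, run_all_tests_fp8.sh, run_all_tests_w4a16.sh) that quantize each model per actorder state, run lm_eval benchmarks (gsm8k, gsm8k_platinum, wikitext, mmlu), and tabulate the results.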
#!/usr/bin/env python3
"""Extract summary data from AWQ DDP regression test log files.
Parses the log output from run_all_tests.sh and produces a comparison table
showing pre-DDP vs post-DDP results across models, schemes, and benchmarks.
Usage:
python extract_log_summary.py regression_results.log
"""
import re
import sys
from collections import defaultdict
def extract_log_summary(log_path):
    with open(log_path, "r") as f:
        content = f.read()

    # Split into sections by the box-drawing delimiters
    section_pattern = re.compile(
        r"║\s+MODEL:\s+(.+?)\n"
        r"\s*║\s+SCHEME:\s+(.+?)\n"
        r"\s*║\s+CODE STATE:\s+(.+?)\n"
        r".*?"
        r"(?=║\s+MODEL:|╔══.*FINAL SUMMARY|\Z)",
        re.DOTALL,
    )
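    # NOTE: this pattern keys on a "CODE STATE:" header line, and the table
    # loop below compares "pre-ddp" vs "post-ddp". The run_all_tests*.sh
    # drivers in this gist emit "ACTORDER:" boxes instead, so their logs
    # will not match until those labels (and the state names) are adapted.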
    sections = section_pattern.findall(content)

    # results[model][scheme][code_state] = {task: {metric: value}}
    results = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
    timing = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))

    for model, scheme, code_state in sections:
        model = model.strip()
        scheme = scheme.strip()
        code_state = code_state.strip()

        # Find the body for this section
        pattern = re.escape(f"MODEL: {model}") + r".*?" + re.escape(f"CODE STATE: {code_state}")
        match = re.search(pattern, content, re.DOTALL)
        if not match:
            continue

        # Get everything after the match until the next section
        start = match.end()
        next_section = re.search(r"║\s+MODEL:", content[start:])
        end = start + next_section.start() if next_section else len(content)
        body = content[start:end]

        # Extract timing
        time_match = re.search(
            r"Time:\s*([\d.]+)\s*minutes\s*\(([\d.]+)\s*seconds\)", body
        )
        if time_match:
            timing[model][scheme][code_state]["time_min"] = float(
                time_match.group(1)
            )
        gpu_match = re.search(r"Peak GPU Memory:\s*([\d.]+)\s*GB", body)
        if gpu_match:
            timing[model][scheme][code_state]["gpu_gb"] = float(gpu_match.group(1))

        # Extract GSM8K strict-match
        strict_main_match = re.search(
            r"strict-match\s*\|.*?\|.*?\|.*?\|([\d.]+)\|", body
        )
        if strict_main_match:
            results[model][scheme][code_state]["gsm8k_strict"] = float(
                strict_main_match.group(1)
            )

        # Extract wikitext word_perplexity
        ppl_match = re.search(
            r"word_perplexity\s*\|.*?\|.*?\|.*?\|([\d.]+)\|", body
        )
        if ppl_match:
            results[model][scheme][code_state]["wikitext_ppl"] = float(
                ppl_match.group(1)
            )

        # Extract MMLU accuracy
        mmlu_match = re.search(
            r"acc\s*\|.*?\|.*?\|.*?\|([\d.]+)\|", body
        )
        if mmlu_match:
            results[model][scheme][code_state]["mmlu_acc"] = float(
                mmlu_match.group(1)
            )

    # Print comparison table
    print(f"\nLog: {log_path}\n")
    metrics = ["gsm8k_strict", "wikitext_ppl", "mmlu_acc"]
    metric_labels = ["GSM8K Strict", "Wiki PPL", "MMLU Acc"]
    header = (
        f"{'Model':<35} {'Scheme':<12} {'State':<10} "
        f"{'Time':>7} {'GPU':>6} "
        + " ".join(f"{m:>12}" for m in metric_labels)
    )
    print(header)
    print("-" * len(header))

    for model in sorted(results.keys()):
        for scheme in sorted(results[model].keys()):
            for code_state in ["pre-ddp", "post-ddp"]:
                r = results[model][scheme].get(code_state, {})
                t = timing[model][scheme].get(code_state, {})
                time_str = (
                    f"{t['time_min']:.1f}m" if "time_min" in t else "N/A"
                )
                gpu_str = (
                    f"{t['gpu_gb']:.1f}G" if "gpu_gb" in t else "N/A"
                )
                vals = []
                for m in metrics:
                    if m in r:
                        vals.append(f"{r[m]:.4f}")
                    else:
                        vals.append("N/A")
                print(
                    f"{model:<35} {scheme:<12} {code_state:<10} "
                    f"{time_str:>7} {gpu_str:>6} "
                    + " ".join(f"{v:>12}" for v in vals)
                )

            # Print delta row
            pre = results[model][scheme].get("pre-ddp", {})
            post = results[model][scheme].get("post-ddp", {})
            deltas = []
            for m in metrics:
                if m in pre and m in post:
                    diff = post[m] - pre[m]
                    sign = "+" if diff >= 0 else ""
                    # For perplexity, lower is better so flip the sign indicator
                    if m == "wikitext_ppl":
                        indicator = " *" if diff > 0 else ""
                    else:
                        indicator = " *" if diff < -0.01 else ""
                    deltas.append(f"{sign}{diff:.4f}{indicator}")
                else:
                    deltas.append("---")
            print(
                f"{'':35} {'':12} {'delta':<10} "
                f"{'':>7} {'':>6} "
                + " ".join(f"{d:>12}" for d in deltas)
            )
            print()


if __name__ == "__main__":
    log_path = sys.argv[1] if len(sys.argv) > 1 else "regression_results.log"
    extract_log_summary(log_path)
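# ── testing/llama3_fp8_block.py ──────────────────────────────────────────────
# (File name taken from the SCRIPTS list in run_all_tests_fp8.sh; pairing it
# with this script is an inference from its model and FP8_BLOCK scheme.)
# Example invocation, matching how the driver scripts call it:
#   python testing/llama3_fp8_block.py --actorder weight --save-dir <save_dir>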
import argparse
import time
import torch
from compressed_tensors.offload import dispatch_model
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor import oneshot
from llmcompressor.modifiers.gptq import GPTQModifier
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--actorder", default=None)
    parser.add_argument("--save-dir", default=None)
    parser.add_argument("--num-samples", type=int, default=NUM_CALIBRATION_SAMPLES)
    args = parser.parse_args()
    num_samples = args.num_samples

    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{num_samples}]")
    ds = ds.shuffle(seed=42)

    def preprocess(example):
        return {
            "text": tokenizer.apply_chat_template(
                example["messages"],
                tokenize=False,
            )
        }

    ds = ds.map(preprocess)

    def tokenize(sample):
        return tokenizer(
            sample["text"],
            padding=False,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
            add_special_tokens=False,
        )

    ds = ds.map(tokenize, remove_columns=ds.column_names)

    gptq_kwargs = dict(
        ignore=["lm_head"],
        scheme="FP8_BLOCK",
        targets=["Linear"],
    )
    if args.actorder:
        gptq_kwargs["actorder"] = args.actorder
    recipe = [GPTQModifier(**gptq_kwargs)]

    torch.cuda.reset_peak_memory_stats()
    start_time = time.time()
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=num_samples,
    )
    elapsed_time = time.time() - start_time
    peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3)

    print("Quantization Complete")
    print(f"Time: {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")
    print(f"Peak GPU Memory: {peak_memory_gb:.2f} GB")
    print("\n\n")
    print("========== SAMPLE GENERATION ==============")
    dispatch_model(model)
    input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
        model.device
    )
    output = model.generate(input_ids, max_new_tokens=100)
    print(tokenizer.decode(output[0]))
    print("==========================================\n\n")

    save_dir = args.save_dir or (
        MODEL_ID.rstrip("/").split("/")[-1]
        + f"-FP8-BLOCK-GPTQ-{args.actorder or 'no-actorder'}"
    )
    model.save_pretrained(save_dir, save_compressed=True)
    tokenizer.save_pretrained(save_dir)
    print(f"Model saved to {save_dir}")


if __name__ == "__main__":
    main()
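# ── testing/llama3_w4a16_gptq.py ─────────────────────────────────────────────
# (File name from run_all_tests.sh's SCRIPTS list; matched to this script by
# its W4A16 scheme.) Identical to the FP8_BLOCK script above except for the
# quantization scheme and the save-dir suffix.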
import argparse
import time
import torch
from compressed_tensors.offload import dispatch_model
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor import oneshot
from llmcompressor.modifiers.gptq import GPTQModifier
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--actorder", default=None)
    parser.add_argument("--save-dir", default=None)
    parser.add_argument("--num-samples", type=int, default=NUM_CALIBRATION_SAMPLES)
    args = parser.parse_args()
    num_samples = args.num_samples

    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{num_samples}]")
    ds = ds.shuffle(seed=42)

    def preprocess(example):
        return {
            "text": tokenizer.apply_chat_template(
                example["messages"],
                tokenize=False,
            )
        }

    ds = ds.map(preprocess)

    def tokenize(sample):
        return tokenizer(
            sample["text"],
            padding=False,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
            add_special_tokens=False,
        )

    ds = ds.map(tokenize, remove_columns=ds.column_names)

    gptq_kwargs = dict(
        ignore=["lm_head"],
        scheme="W4A16",
        targets=["Linear"],
    )
    if args.actorder:
        gptq_kwargs["actorder"] = args.actorder
    recipe = [GPTQModifier(**gptq_kwargs)]

    torch.cuda.reset_peak_memory_stats()
    start_time = time.time()
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=num_samples,
    )
    elapsed_time = time.time() - start_time
    peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3)

    print("Quantization Complete")
    print(f"Time: {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")
    print(f"Peak GPU Memory: {peak_memory_gb:.2f} GB")
    print("\n\n")
    print("========== SAMPLE GENERATION ==============")
    dispatch_model(model)
    input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
        model.device
    )
    output = model.generate(input_ids, max_new_tokens=100)
    print(tokenizer.decode(output[0]))
    print("==========================================\n\n")

    save_dir = args.save_dir or (
        MODEL_ID.rstrip("/").split("/")[-1]
        + f"-W4A16-GPTQ-{args.actorder or 'no-actorder'}"
    )
    model.save_pretrained(save_dir, save_compressed=True)
    tokenizer.save_pretrained(save_dir)
    print(f"Model saved to {save_dir}")


if __name__ == "__main__":
    main()
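# ── testing/llama3_w8a16_gptq.py ─────────────────────────────────────────────
# (File name from run_all_tests.sh's SCRIPTS list; matched by its W8A16
# scheme.) Identical to the two scripts above except for the scheme and the
# save-dir suffix.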
import argparse
import time
import torch
from compressed_tensors.offload import dispatch_model
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor import oneshot
from llmcompressor.modifiers.gptq import GPTQModifier
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--actorder", default=None)
    parser.add_argument("--save-dir", default=None)
    parser.add_argument("--num-samples", type=int, default=NUM_CALIBRATION_SAMPLES)
    args = parser.parse_args()
    num_samples = args.num_samples

    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{num_samples}]")
    ds = ds.shuffle(seed=42)

    def preprocess(example):
        return {
            "text": tokenizer.apply_chat_template(
                example["messages"],
                tokenize=False,
            )
        }

    ds = ds.map(preprocess)

    def tokenize(sample):
        return tokenizer(
            sample["text"],
            padding=False,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
            add_special_tokens=False,
        )

    ds = ds.map(tokenize, remove_columns=ds.column_names)

    gptq_kwargs = dict(
        ignore=["lm_head"],
        scheme="W8A16",
        targets=["Linear"],
    )
    if args.actorder:
        gptq_kwargs["actorder"] = args.actorder
    recipe = [GPTQModifier(**gptq_kwargs)]

    torch.cuda.reset_peak_memory_stats()
    start_time = time.time()
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=num_samples,
    )
    elapsed_time = time.time() - start_time
    peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3)

    print("Quantization Complete")
    print(f"Time: {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")
    print(f"Peak GPU Memory: {peak_memory_gb:.2f} GB")
    print("\n\n")
    print("========== SAMPLE GENERATION ==============")
    dispatch_model(model)
    input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
        model.device
    )
    output = model.generate(input_ids, max_new_tokens=100)
    print(tokenizer.decode(output[0]))
    print("==========================================\n\n")

    save_dir = args.save_dir or (
        MODEL_ID.rstrip("/").split("/")[-1]
        + f"-W8A16-GPTQ-{args.actorder or 'no-actorder'}"
    )
    model.save_pretrained(save_dir, save_compressed=True)
    tokenizer.save_pretrained(save_dir)
    print(f"Model saved to {save_dir}")


if __name__ == "__main__":
    main()
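# ── testing/qwen3_vl_fp8_block.py ────────────────────────────────────────────
# (File name from the SCRIPTS list in run_all_tests_fp8.sh; matched by model
# and scheme.) Vision-language variant: calibrates on flickr30k image-text
# pairs and quantizes only the text decoder layers, ignoring lm_head and the
# visual tower.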
import argparse
import base64
import time
from io import BytesIO
import torch
from compressed_tensors.offload import dispatch_model
from datasets import load_dataset
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from llmcompressor import oneshot
from llmcompressor.modifiers.gptq import GPTQModifier
MODEL_ID = "Qwen/Qwen3-VL-8B-Instruct"
DATASET_ID = "lmms-lab/flickr30k"
DATASET_SPLIT = "test"
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--actorder", default=None)
    parser.add_argument("--save-dir", default=None)
    parser.add_argument("--num-samples", type=int, default=NUM_CALIBRATION_SAMPLES)
    args = parser.parse_args()
    num_samples = args.num_samples

    model = Qwen3VLForConditionalGeneration.from_pretrained(
        MODEL_ID, torch_dtype="auto"
    )
    processor = AutoProcessor.from_pretrained(MODEL_ID)

    ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{num_samples}]")
    ds = ds.shuffle(seed=42)

    def preprocess_and_tokenize(example):
        buffered = BytesIO()
        example["image"].save(buffered, format="PNG")
        encoded_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
        base64_qwen = f"data:image;base64,{encoded_image}"
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": base64_qwen},
                    {"type": "text", "text": "What does the image show?"},
                ],
            }
        ]
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        return processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=False,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
        )

    ds = ds.map(preprocess_and_tokenize, remove_columns=ds.column_names)
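    # The calibration loader used by oneshot delivers one sample per batch
    # here (the assert guards this); the collator just tensorizes that sample.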
    def data_collator(batch):
        assert len(batch) == 1
        return {key: torch.tensor(value) for key, value in batch[0].items()}

    gptq_kwargs = dict(
        ignore=["re:.*lm_head", "re:.*visual.*"],
        scheme="FP8_BLOCK",
        targets=["Linear"],
    )
    if args.actorder:
        gptq_kwargs["actorder"] = args.actorder
    recipe = [GPTQModifier(**gptq_kwargs)]

    torch.cuda.reset_peak_memory_stats()
    start_time = time.time()
    oneshot(
        model=model,
        tokenizer=MODEL_ID,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=num_samples,
        data_collator=data_collator,
        sequential_targets=["Qwen3VLTextDecoderLayer"],
    )
    elapsed_time = time.time() - start_time
    peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3)

    print("Quantization Complete")
    print(f"Time: {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")
    print(f"Peak GPU Memory: {peak_memory_gb:.2f} GB")
    print("\n\n")
    print("========== SAMPLE GENERATION ==============")
    dispatch_model(model)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": "http://images.cocodataset.org/train2017/000000231895.jpg",
                },
                {"type": "text", "text": "Please describe the animal in this image\n"},
            ],
        }
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[prompt],
        images=image_inputs,
        videos=video_inputs,
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        return_tensors="pt",
    ).to(model.device)
    output = model.generate(**inputs, max_new_tokens=100)
    print(processor.decode(output[0], skip_special_tokens=True))
    print("==========================================\n\n")

    save_dir = args.save_dir or (
        MODEL_ID.rstrip("/").split("/")[-1]
        + f"-FP8-BLOCK-GPTQ-{args.actorder or 'no-actorder'}"
    )
    model.save_pretrained(save_dir, save_compressed=True)
    processor.save_pretrained(save_dir)
    print(f"Model saved to {save_dir}")


if __name__ == "__main__":
    main()
#!/bin/bash
# GPTQ actorder Regression Test Suite
# Compares without-actorder vs with-actorder (actorder=weight) vs
# with-group-actorder (actorder=group) for GPTQ quantization across models,
# schemes, and benchmarks.
#
# Usage:
#   ./run_all_tests.sh 2>&1 | tee regression_results.log
#   python extract_log_summary.py regression_results.log
#
# Quantized models are removed after their evaluations complete to free disk
# space; eval outputs and the results CSV are kept for follow-up analysis.
set -o pipefail
# Avoid permission errors on shared HF cache files
export HF_DATASETS_CACHE="/tmp/hf_datasets_cache"
mkdir -p "$HF_DATASETS_CACHE"
# ── Configuration ────────────────────────────────────────────────────────────
# Each entry defines one (script, model, scheme, vllm_args) test configuration.
# The scheme is tied to the script — no cross-product.
SCRIPTS=(
    # "testing/llama3_fp8_block.py"
    # "testing/qwen25_32b_fp8_block.py"
    # "testing/qwen3_vl_fp8_block.py"
    # "testing/llama4_scout_fp8_block.py"
    # "testing/mixtral_fp8_block.py"
    "testing/llama3_w4a16_gptq.py"
    "testing/llama3_w8a16_gptq.py"
)
MODEL_SHORT_NAMES=(
    # "Meta-Llama-3-8B-Instruct"
    # "Qwen2.5-32B-Instruct"
    # "Qwen3-VL-8B-Instruct"
    # "Llama-4-Scout-17B-16E-Instruct"
    # "Mixtral-8x7B-Instruct-v0.1"
    "Meta-Llama-3-8B-Instruct"
    "Meta-Llama-3-8B-Instruct"
)
# Scheme label per entry (used for naming and CSV output)
MODEL_SCHEMES=(
    # "FP8_BLOCK"
    # "FP8_BLOCK"
    # "FP8_BLOCK"
    # "FP8_BLOCK"
    # "FP8_BLOCK"
    "W4A16"
    "W8A16"
)
# vLLM eval settings per entry: max_model_len,tensor_parallel_size,num_gpus_quant
MODEL_VLLM_ARGS=(
    # "2048,1,1"
    # "4096,2,1"
    # "4096,1,1"
    # "4096,2,1"
    # "2048,2,1"
    "2048,1,1"
    "2048,1,1"
)
# without-actorder: no actorder flag (standard GPTQ)
# with-actorder: actorder=weight
# with-group-actorder: actorder=group
ACTORDER_STATES=("without-actorder" "with-actorder" "with-group-actorder")
# eval_name lm_eval_task fewshot backend
# gsm8k gsm8k 5 vllm
# gsm8k_platinum gsm8k_platinum 5 vllm
# wikitext wikitext 0 vllm
# mmlu mmlu 5 vllm
EVAL_NAMES=("gsm8k" "gsm8k_platinum" "wikitext" "mmlu")
EVAL_LM_TASKS=("gsm8k" "gsm8k_platinum" "wikitext" "mmlu")
EVAL_FEWSHOT=("5" "5" "0" "5")
EVAL_BACKENDS=("vllm" "vllm" "vllm" "vllm")
EVAL_BASE_DIR="./eval_results"
MODEL_BASE_DIR="./regression_models"
RESULTS_CSV="regression_results.csv"
mkdir -p "$EVAL_BASE_DIR" "$MODEL_BASE_DIR"
# ── Helper: activate environments ────────────────────────────────────────────
activate_quant_env() {
    source /home/HDCharles/rhdev/bin/activate
}
activate_eval_env() {
    source /home/HDCharles/vllm/bin/activate
}
# ── Helper: run vLLM evaluation with fallback chain ──────────────────────────
EVAL_BACKEND="" # set by run_vllm_eval to indicate which backend succeeded
run_vllm_eval() {
    local save_dir=$1
    local task=$2
    local num_fewshot=$3
    local max_model_len=$4
    local tp_size=$5
    local eval_output_dir=$6
    mkdir -p "$eval_output_dir"
    EVAL_BACKEND="FAILED"
    activate_eval_env

    echo " EVAL: $task (fewshot=$num_fewshot, tp=$tp_size, max_len=$max_model_len)"

    # Build common eval flags
    local chat_args="--apply_chat_template"
    if [ "$num_fewshot" -gt 0 ]; then
        chat_args="$chat_args --fewshot_as_multiturn"
    fi

    # Try with tensor_parallel
    if [ "$tp_size" -gt 1 ]; then
        lm_eval \
            --model vllm \
            --model_args "pretrained=$save_dir,dtype=auto,max_model_len=$max_model_len,add_bos_token=True,tensor_parallel_size=$tp_size,gpu_memory_utilization=0.85" \
            --tasks "$task" \
            --num_fewshot "$num_fewshot" \
            --batch_size auto \
            $chat_args \
            --output_path "$eval_output_dir" 2>&1
        if [ $? -eq 0 ]; then EVAL_BACKEND="vllm_tp${tp_size}"; return 0; fi

        echo " TP=$tp_size failed, trying expert_parallel..."
        lm_eval \
            --model vllm \
            --model_args "pretrained=$save_dir,dtype=auto,max_model_len=$max_model_len,add_bos_token=True,enable_expert_parallel=True,gpu_memory_utilization=0.85" \
            --tasks "$task" \
            --num_fewshot "$num_fewshot" \
            --batch_size auto \
            $chat_args \
            --output_path "$eval_output_dir" 2>&1
        if [ $? -eq 0 ]; then EVAL_BACKEND="vllm_expert_parallel"; return 0; fi
    fi

    echo " Trying TP=1..."
    lm_eval \
        --model vllm \
        --model_args "pretrained=$save_dir,dtype=auto,max_model_len=$max_model_len,add_bos_token=True,gpu_memory_utilization=0.85" \
        --tasks "$task" \
        --num_fewshot "$num_fewshot" \
        --batch_size auto \
        $chat_args \
        --output_path "$eval_output_dir" 2>&1
    if [ $? -eq 0 ]; then EVAL_BACKEND="vllm_tp1"; return 0; fi

    echo " Trying enforce_eager..."
    lm_eval \
        --model vllm \
        --model_args "pretrained=$save_dir,dtype=auto,max_model_len=$max_model_len,add_bos_token=True,enforce_eager=True,gpu_memory_utilization=0.85" \
        --tasks "$task" \
        --num_fewshot "$num_fewshot" \
        --batch_size auto \
        $chat_args \
        --output_path "$eval_output_dir" 2>&1
    if [ $? -eq 0 ]; then EVAL_BACKEND="vllm_eager"; return 0; fi

    echo " Trying hf backend as last resort..."
    lm_eval \
        --model hf \
        --model_args "pretrained=$save_dir,dtype=auto,add_bos_token=True" \
        --tasks "$task" \
        --num_fewshot "$num_fewshot" \
        --batch_size auto \
        $chat_args \
        --output_path "$eval_output_dir" 2>&1
    if [ $? -eq 0 ]; then EVAL_BACKEND="hf"; return 0; fi

    EVAL_BACKEND="FAILED"
    return 1
}
# ── Helper: run HF-only evaluation ─────────────────────────────────────────
run_hf_eval() {
    local save_dir=$1
    local task=$2
    local num_fewshot=$3
    local eval_output_dir=$4
    mkdir -p "$eval_output_dir"
    EVAL_BACKEND="FAILED"
    activate_eval_env

    echo " EVAL: $task (fewshot=$num_fewshot, backend=hf)"

    # Build common eval flags
    local chat_args="--apply_chat_template"
    if [ "$num_fewshot" -gt 0 ]; then
        chat_args="$chat_args --fewshot_as_multiturn"
    fi

    lm_eval \
        --model hf \
        --model_args "pretrained=$save_dir,dtype=auto,add_bos_token=True" \
        --tasks "$task" \
        --num_fewshot "$num_fewshot" \
        --batch_size auto \
        $chat_args \
        --output_path "$eval_output_dir" 2>&1
    if [ $? -eq 0 ]; then EVAL_BACKEND="hf"; return 0; fi

    EVAL_BACKEND="FAILED"
    return 1
}
# ── Helper: extract metric from lm_eval JSON results ────────────────────────
extract_metric() {
    local eval_output_dir=$1
    local task=$2
    # Find the most recent results JSON in the eval output dir
    local results_json
    results_json=$(find "$eval_output_dir" -name "results_*.json" -type f 2>/dev/null | sort | tail -1)
    if [ -z "$results_json" ]; then
        echo "N/A"
        return
    fi
    python3 -c "
import json, sys

with open('$results_json') as f:
    data = json.load(f)
results = data.get('results', {})
task = '$task'

# Handle task name variations (e.g., wikitext vs wikitext2)
task_results = None
for key in results:
    if task in key:
        task_results = results[key]
        break
if task_results is None:
    print('N/A')
    sys.exit()

# Extract the primary metric for each task
if 'gsm8k' in task:
    val = task_results.get('exact_match,strict-match')
    if val is not None:
        print(f'{val*100:.2f}%')
    else:
        print('N/A')
elif 'wikitext' in task:
    val = task_results.get('word_perplexity,none')
    if val is not None:
        print(f'{val:.2f}')
    else:
        print('N/A')
elif 'mmlu' in task:
    val = task_results.get('acc,none')
    if val is not None:
        print(f'{val*100:.2f}%')
    else:
        print('N/A')
else:
    # Generic: grab first non-stderr, non-alias metric
    for k, v in task_results.items():
        if 'stderr' not in k and k != 'alias' and isinstance(v, (int, float)):
            print(f'{v:.4f}')
            sys.exit()
    print('N/A')
" 2>/dev/null || echo "N/A"
}
# ── Helper: print current results summary ────────────────────────────────────
print_summary() {
    echo ""
    echo "╔══════════════════════════════════════════════════════════════════════════════════════════════════════╗"
    echo "║ RESULTS SUMMARY (so far) ║"
    echo "╚══════════════════════════════════════════════════════════════════════════════════════════════════════╝"
    echo ""
    if [ -f "$RESULTS_CSV" ]; then
        # Print header + all rows as a formatted table
        column -t -s',' < "$RESULTS_CSV"
    else
        echo "(no results yet)"
    fi
    echo ""
    echo "════════════════════════════════════════════════════════════════════════════════════════════════════════"
    echo ""
}
# ── Helper: print actorder comparison table ─────────────────────────────────
print_comparison() {
    if [ ! -f "$RESULTS_CSV" ]; then
        return
    fi
    python3 - "$RESULTS_CSV" <<'PYEOF'
import csv, sys

csv_path = sys.argv[1]
rows = []
with open(csv_path) as f:
    reader = csv.DictReader(f)
    for r in reader:
        rows.append(r)
if not rows:
    sys.exit()

actorder_keys = ["without-actorder", "with-actorder", "with-group-actorder"]

# Build lookup: (model, scheme, task) -> {actorder_state: metric}
lookup = {}
for r in rows:
    key = (r["model"], r["scheme"], r["task"])
    lookup.setdefault(key, {})
    lookup[key][r["actorder"]] = r["metric"]

# Only print if we have at least one row with baseline + one other
entries = [(k, v) for k, v in lookup.items()
           if "without-actorder" in v and
           any(s in v for s in actorder_keys[1:])]
if not entries:
    sys.exit()

def parse_metric(s):
    s = s.strip()
    if s.endswith("%"):
        return float(s[:-1]), True
    try:
        return float(s), False
    except ValueError:
        return None, False

def calc_improvement(baseline_str, compare_str, task):
    b_val, _ = parse_metric(baseline_str)
    c_val, _ = parse_metric(compare_str)
    if b_val is None or c_val is None or b_val == 0:
        return "N/A"
    if "wikitext" in task:
        pct = (b_val - c_val) / b_val * 100
    else:
        pct = (c_val - b_val) / b_val * 100
    sign = "+" if pct >= 0 else ""
    return f"{sign}{pct:.2f}%"

print("")
print("╔════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╗")
print("║ ACTORDER COMPARISON (vs without-actorder baseline) ║")
print("╚════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╝")
print("")
header = (f"{'model':<36} {'scheme':<12} {'task':<18} "
          f"{'no-actorder':>14} "
          f"{'weight':>14} {'wt vs base':>12} "
          f"{'group':>14} {'grp vs base':>12}")
print(header)
print("-" * len(header))
for (model, scheme, task), metrics in sorted(entries):
    wo = metrics.get("without-actorder", "")
    wi = metrics.get("with-actorder", "")
    wg = metrics.get("with-group-actorder", "")
    wi_imp = calc_improvement(wo, wi, task) if wo and wi else ""
    wg_imp = calc_improvement(wo, wg, task) if wo and wg else ""
    print(f"{model:<36} {scheme:<12} {task:<18} "
          f"{wo:>14} "
          f"{wi:>14} {wi_imp:>12} "
          f"{wg:>14} {wg_imp:>12}")
print("")
PYEOF
}
# ── Initialize results CSV (preserve previous results) ──────────────────────
if [ -f "$RESULTS_CSV" ]; then
cp "$RESULTS_CSV" "${RESULTS_CSV}.bak"
fi
echo "model,scheme,actorder,task,metric,status,eval_backend,save_dir" > "$RESULTS_CSV"
# ── Main loop ────────────────────────────────────────────────────────────────
TOTAL=0
PASSED=0
FAILED=0
for model_idx in "${!SCRIPTS[@]}"; do
    script="${SCRIPTS[$model_idx]}"
    model_name="${MODEL_SHORT_NAMES[$model_idx]}"
    scheme="${MODEL_SCHEMES[$model_idx]}"
    IFS=',' read -r max_model_len tp_size num_gpus_quant <<< "${MODEL_VLLM_ARGS[$model_idx]}"

    for actorder_state in "${ACTORDER_STATES[@]}"; do
        save_dir="$MODEL_BASE_DIR/${model_name}-${scheme}-${actorder_state}"

        echo ""
        echo "╔══════════════════════════════════════════════════════════════════════════════════════════╗"
        echo "║ MODEL: $model_name"
        echo "║ SCHEME: $scheme"
        echo "║ ACTORDER: $actorder_state"
        echo "╚══════════════════════════════════════════════════════════════════════════════════════════╝"
        echo ""

        # ── Skip entirely if all evals already have results ────
        all_evals_cached=true
        for eval_idx in "${!EVAL_NAMES[@]}"; do
            eval_dir="$EVAL_BASE_DIR/${model_name}-${scheme}-${actorder_state}/${EVAL_NAMES[$eval_idx]}"
            if ! find "$eval_dir" -name "results_*.json" -type f 2>/dev/null | grep -q .; then
                all_evals_cached=false
                break
            fi
        done
        if [ "$all_evals_cached" = true ]; then
            echo "All evals already cached, skipping quantization and eval."
            for eval_idx in "${!EVAL_NAMES[@]}"; do
                eval_name="${EVAL_NAMES[$eval_idx]}"
                lm_task="${EVAL_LM_TASKS[$eval_idx]}"
                eval_dir="$EVAL_BASE_DIR/${model_name}-${scheme}-${actorder_state}/${eval_name}"
                metric_val=$(extract_metric "$eval_dir" "$lm_task")
                echo " $eval_name: $metric_val"
                echo "$model_name,$scheme,$actorder_state,$eval_name,$metric_val,PASSED,cached,$save_dir" >> "$RESULTS_CSV"
                PASSED=$((PASSED + 1))
                TOTAL=$((TOTAL + 1))
            done
            print_summary
            print_comparison
            continue
        fi

        # ── Quantize (skip if model already exists) ────────────
        if [ -d "$save_dir" ] && [ -f "$save_dir/config.json" ]; then
            echo "Quantized model already exists at $save_dir, skipping quantization."
        else
            activate_quant_env
            echo "============================================"
            echo "Running: $script (actorder_state=$actorder_state)"
            echo "============================================"

            # Build actorder argument
            actorder_arg=""
            if [ "$actorder_state" == "with-actorder" ]; then
                actorder_arg="--actorder weight"
            elif [ "$actorder_state" == "with-group-actorder" ]; then
                actorder_arg="--actorder group"
            fi

            if [ "$num_gpus_quant" -gt 1 ]; then
                torchrun --nproc_per_node="$num_gpus_quant" "$script" \
                    $actorder_arg --save-dir "$save_dir" 2>&1
            else
                python "$script" $actorder_arg --save-dir "$save_dir" 2>&1
            fi
            quant_status=$?
            if [ $quant_status -ne 0 ]; then
                echo "QUANTIZATION FAILED for $model_name / $scheme / $actorder_state"
                for eval_name in "${EVAL_NAMES[@]}"; do
                    echo "$model_name,$scheme,$actorder_state,$eval_name,N/A,QUANT_FAILED,N/A,$save_dir" >> "$RESULTS_CSV"
                done
                FAILED=$((FAILED + ${#EVAL_NAMES[@]}))
                TOTAL=$((TOTAL + ${#EVAL_NAMES[@]}))
                print_summary
                print_comparison
                continue
            fi
        fi

        # ── Clear GPU memory before eval ─────────────────────────
        python3 -c "import torch; torch.cuda.empty_cache(); [torch.cuda.reset_peak_memory_stats(i) for i in range(torch.cuda.device_count())]" 2>/dev/null

        # ── Evaluate ─────────────────────────────────────────────
        for eval_idx in "${!EVAL_NAMES[@]}"; do
            eval_name="${EVAL_NAMES[$eval_idx]}"
            lm_task="${EVAL_LM_TASKS[$eval_idx]}"
            fewshot="${EVAL_FEWSHOT[$eval_idx]}"
            backend="${EVAL_BACKENDS[$eval_idx]}"
            eval_dir="$EVAL_BASE_DIR/${model_name}-${scheme}-${actorder_state}/${eval_name}"
            TOTAL=$((TOTAL + 1))

            # Skip eval if results already exist
            existing_result=$(find "$eval_dir" -name "results_*.json" -type f 2>/dev/null | sort | tail -1)
            if [ -n "$existing_result" ]; then
                metric_val=$(extract_metric "$eval_dir" "$lm_task")
                echo " EVAL: $eval_name — skipping, previous result found: $metric_val"
                echo "$model_name,$scheme,$actorder_state,$eval_name,$metric_val,PASSED,cached,$save_dir" >> "$RESULTS_CSV"
                PASSED=$((PASSED + 1))
                continue
            fi

            if [ "$backend" == "hf" ]; then
                run_hf_eval "$save_dir" "$lm_task" "$fewshot" "$eval_dir"
            else
                run_vllm_eval "$save_dir" "$lm_task" "$fewshot" "$max_model_len" "$tp_size" "$eval_dir"
            fi
            eval_status=$?
            if [ $eval_status -eq 0 ]; then
                metric_val=$(extract_metric "$eval_dir" "$lm_task")
                echo "$model_name,$scheme,$actorder_state,$eval_name,$metric_val,PASSED,$EVAL_BACKEND,$save_dir" >> "$RESULTS_CSV"
                PASSED=$((PASSED + 1))
            else
                echo "$model_name,$scheme,$actorder_state,$eval_name,N/A,FAILED,$EVAL_BACKEND,$save_dir" >> "$RESULTS_CSV"
                FAILED=$((FAILED + 1))
            fi
        done

        # ── Clean up model to free disk space ────────────────────
        if [ -d "$save_dir" ]; then
            echo "Removing quantized model at $save_dir to free disk space."
            rm -rf "$save_dir"
        fi

        print_summary
        print_comparison
    done # actorder_state
done # model
# ── Final Summary ────────────────────────────────────────────────────────────
echo ""
echo "╔══════════════════════════════════════════════════════════════════════════════════════════╗"
echo "║ FINAL SUMMARY: $PASSED passed, $FAILED failed out of $TOTAL total evaluations ║"
echo "╚══════════════════════════════════════════════════════════════════════════════════════════╝"
echo ""
print_summary
print_comparison
echo "Results CSV: $RESULTS_CSV"
echo "Saved models: $MODEL_BASE_DIR/"
echo "Eval outputs: $EVAL_BASE_DIR/"
echo ""
echo "To extract detailed metrics from the log:"
echo " python extract_log_summary.py regression_results.log"
#!/bin/bash
# FP8 Block GPTQ actorder Regression Test Suite
# Compares without-actorder vs with-actorder (actorder=weight) for FP8 block
# GPTQ quantization across models and benchmarks.
#
# Usage:
#   ./run_all_tests_fp8.sh 2>&1 | tee regression_results_fp8.log
#   python extract_log_summary.py regression_results_fp8.log
set -o pipefail
# Avoid permission errors on shared HF cache files
export HF_DATASETS_CACHE="/tmp/hf_datasets_cache"
mkdir -p "$HF_DATASETS_CACHE"
# ── Configuration ────────────────────────────────────────────────────────────
SCRIPTS=(
    "testing/llama3_fp8_block.py"
    "testing/qwen25_32b_fp8_block.py"
    "testing/qwen3_vl_fp8_block.py"
    "testing/llama4_scout_fp8_block.py"
    "testing/mixtral_fp8_block.py"
)
MODEL_SHORT_NAMES=(
    "Meta-Llama-3-8B-Instruct"
    "Qwen2.5-32B-Instruct"
    "Qwen3-VL-8B-Instruct"
    "Llama-4-Scout-17B-16E-Instruct"
    "Mixtral-8x7B-Instruct-v0.1"
)
MODEL_SCHEMES=(
    "FP8_BLOCK"
    "FP8_BLOCK"
    "FP8_BLOCK"
    "FP8_BLOCK"
    "FP8_BLOCK"
)
# vLLM eval settings per entry: max_model_len,tensor_parallel_size,num_gpus_quant
MODEL_VLLM_ARGS=(
    "2048,1,1"
    "4096,2,1"
    "4096,1,1"
    "4096,2,1"
    "2048,2,1"
)
# without-actorder: no actorder flag (standard GPTQ)
# with-actorder: actorder=weight
ACTORDER_STATES=("without-actorder" "with-actorder")
EVAL_NAMES=("gsm8k" "gsm8k_platinum" "wikitext" "mmlu")
EVAL_LM_TASKS=("gsm8k" "gsm8k_platinum" "wikitext" "mmlu")
EVAL_FEWSHOT=("5" "5" "0" "5")
EVAL_BACKENDS=("vllm" "vllm" "vllm" "vllm")
EVAL_BASE_DIR="./eval_results"
MODEL_BASE_DIR="./regression_models"
RESULTS_CSV="regression_results_fp8.csv"
mkdir -p "$EVAL_BASE_DIR" "$MODEL_BASE_DIR"
# ── Helper: activate environments ────────────────────────────────────────────
activate_quant_env() {
    source /home/HDCharles/rhdev/bin/activate
}
activate_eval_env() {
    source /home/HDCharles/vllm/bin/activate
}
# ── Helper: run vLLM evaluation with fallback chain ──────────────────────────
EVAL_BACKEND=""
run_vllm_eval() {
    local save_dir=$1
    local task=$2
    local num_fewshot=$3
    local max_model_len=$4
    local tp_size=$5
    local eval_output_dir=$6
    mkdir -p "$eval_output_dir"
    EVAL_BACKEND="FAILED"
    activate_eval_env

    echo " EVAL: $task (fewshot=$num_fewshot, tp=$tp_size, max_len=$max_model_len)"

    local chat_args="--apply_chat_template"
    if [ "$num_fewshot" -gt 0 ]; then
        chat_args="$chat_args --fewshot_as_multiturn"
    fi

    if [ "$tp_size" -gt 1 ]; then
        lm_eval \
            --model vllm \
            --model_args "pretrained=$save_dir,dtype=auto,max_model_len=$max_model_len,add_bos_token=True,tensor_parallel_size=$tp_size,gpu_memory_utilization=0.85" \
            --tasks "$task" \
            --num_fewshot "$num_fewshot" \
            --batch_size auto \
            $chat_args \
            --output_path "$eval_output_dir" 2>&1
        if [ $? -eq 0 ]; then EVAL_BACKEND="vllm_tp${tp_size}"; return 0; fi

        echo " TP=$tp_size failed, trying expert_parallel..."
        lm_eval \
            --model vllm \
            --model_args "pretrained=$save_dir,dtype=auto,max_model_len=$max_model_len,add_bos_token=True,enable_expert_parallel=True,gpu_memory_utilization=0.85" \
            --tasks "$task" \
            --num_fewshot "$num_fewshot" \
            --batch_size auto \
            $chat_args \
            --output_path "$eval_output_dir" 2>&1
        if [ $? -eq 0 ]; then EVAL_BACKEND="vllm_expert_parallel"; return 0; fi
    fi

    echo " Trying TP=1..."
    lm_eval \
        --model vllm \
        --model_args "pretrained=$save_dir,dtype=auto,max_model_len=$max_model_len,add_bos_token=True,gpu_memory_utilization=0.85" \
        --tasks "$task" \
        --num_fewshot "$num_fewshot" \
        --batch_size auto \
        $chat_args \
        --output_path "$eval_output_dir" 2>&1
    if [ $? -eq 0 ]; then EVAL_BACKEND="vllm_tp1"; return 0; fi

    echo " Trying enforce_eager..."
    lm_eval \
        --model vllm \
        --model_args "pretrained=$save_dir,dtype=auto,max_model_len=$max_model_len,add_bos_token=True,enforce_eager=True,gpu_memory_utilization=0.85" \
        --tasks "$task" \
        --num_fewshot "$num_fewshot" \
        --batch_size auto \
        $chat_args \
        --output_path "$eval_output_dir" 2>&1
    if [ $? -eq 0 ]; then EVAL_BACKEND="vllm_eager"; return 0; fi

    echo " Trying hf backend as last resort..."
    lm_eval \
        --model hf \
        --model_args "pretrained=$save_dir,dtype=auto,add_bos_token=True" \
        --tasks "$task" \
        --num_fewshot "$num_fewshot" \
        --batch_size auto \
        $chat_args \
        --output_path "$eval_output_dir" 2>&1
    if [ $? -eq 0 ]; then EVAL_BACKEND="hf"; return 0; fi

    EVAL_BACKEND="FAILED"
    return 1
}
# ── Helper: run HF-only evaluation ─────────────────────────────────────────
run_hf_eval() {
    local save_dir=$1
    local task=$2
    local num_fewshot=$3
    local eval_output_dir=$4
    mkdir -p "$eval_output_dir"
    EVAL_BACKEND="FAILED"
    activate_eval_env

    echo " EVAL: $task (fewshot=$num_fewshot, backend=hf)"

    local chat_args="--apply_chat_template"
    if [ "$num_fewshot" -gt 0 ]; then
        chat_args="$chat_args --fewshot_as_multiturn"
    fi

    lm_eval \
        --model hf \
        --model_args "pretrained=$save_dir,dtype=auto,add_bos_token=True" \
        --tasks "$task" \
        --num_fewshot "$num_fewshot" \
        --batch_size auto \
        $chat_args \
        --output_path "$eval_output_dir" 2>&1
    if [ $? -eq 0 ]; then EVAL_BACKEND="hf"; return 0; fi

    EVAL_BACKEND="FAILED"
    return 1
}
# ── Helper: extract metric from lm_eval JSON results ────────────────────────
extract_metric() {
    local eval_output_dir=$1
    local task=$2
    local results_json
    results_json=$(find "$eval_output_dir" -name "results_*.json" -type f 2>/dev/null | sort | tail -1)
    if [ -z "$results_json" ]; then
        echo "N/A"
        return
    fi
    python3 -c "
import json, sys

with open('$results_json') as f:
    data = json.load(f)
results = data.get('results', {})
task = '$task'

task_results = None
for key in results:
    if task in key:
        task_results = results[key]
        break
if task_results is None:
    print('N/A')
    sys.exit()

if 'gsm8k' in task:
    val = task_results.get('exact_match,strict-match')
    if val is not None:
        print(f'{val*100:.2f}%')
    else:
        print('N/A')
elif 'wikitext' in task:
    val = task_results.get('word_perplexity,none')
    if val is not None:
        print(f'{val:.2f}')
    else:
        print('N/A')
elif 'mmlu' in task:
    val = task_results.get('acc,none')
    if val is not None:
        print(f'{val*100:.2f}%')
    else:
        print('N/A')
else:
    for k, v in task_results.items():
        if 'stderr' not in k and k != 'alias' and isinstance(v, (int, float)):
            print(f'{v:.4f}')
            sys.exit()
    print('N/A')
" 2>/dev/null || echo "N/A"
}
# ── Helper: print current results summary ────────────────────────────────────
print_summary() {
    echo ""
    echo "╔══════════════════════════════════════════════════════════════════════════════════════════════════════╗"
    echo "║ RESULTS SUMMARY (so far) ║"
    echo "╚══════════════════════════════════════════════════════════════════════════════════════════════════════╝"
    echo ""
    if [ -f "$RESULTS_CSV" ]; then
        column -t -s',' < "$RESULTS_CSV"
    else
        echo "(no results yet)"
    fi
    echo ""
    echo "════════════════════════════════════════════════════════════════════════════════════════════════════════"
    echo ""
}
# ── Helper: print actorder comparison table ─────────────────────────────────
print_comparison() {
    if [ ! -f "$RESULTS_CSV" ]; then
        return
    fi
    python3 - "$RESULTS_CSV" <<'PYEOF'
import csv, sys

csv_path = sys.argv[1]
rows = []
with open(csv_path) as f:
    reader = csv.DictReader(f)
    for r in reader:
        rows.append(r)
if not rows:
    sys.exit()

actorder_keys = ["without-actorder", "with-actorder"]

lookup = {}
for r in rows:
    key = (r["model"], r["scheme"], r["task"])
    lookup.setdefault(key, {})
    lookup[key][r["actorder"]] = r["metric"]

entries = [(k, v) for k, v in lookup.items()
           if "without-actorder" in v and
           any(s in v for s in actorder_keys[1:])]
if not entries:
    sys.exit()

def parse_metric(s):
    s = s.strip()
    if s.endswith("%"):
        return float(s[:-1]), True
    try:
        return float(s), False
    except ValueError:
        return None, False

def calc_improvement(baseline_str, compare_str, task):
    b_val, _ = parse_metric(baseline_str)
    c_val, _ = parse_metric(compare_str)
    if b_val is None or c_val is None or b_val == 0:
        return "N/A"
    if "wikitext" in task:
        pct = (b_val - c_val) / b_val * 100
    else:
        pct = (c_val - b_val) / b_val * 100
    sign = "+" if pct >= 0 else ""
    return f"{sign}{pct:.2f}%"

print("")
print("╔════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╗")
print("║ ACTORDER COMPARISON (vs without-actorder baseline) ║")
print("╚════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╝")
print("")
header = (f"{'model':<36} {'scheme':<12} {'task':<18} "
          f"{'no-actorder':>14} "
          f"{'weight':>14} {'wt vs base':>12}")
print(header)
print("-" * len(header))
for (model, scheme, task), metrics in sorted(entries):
    wo = metrics.get("without-actorder", "")
    wi = metrics.get("with-actorder", "")
    wi_imp = calc_improvement(wo, wi, task) if wo and wi else ""
    print(f"{model:<36} {scheme:<12} {task:<18} "
          f"{wo:>14} "
          f"{wi:>14} {wi_imp:>12}")
print("")
PYEOF
}
# ── Initialize results CSV (preserve previous results) ──────────────────────
if [ -f "$RESULTS_CSV" ]; then
cp "$RESULTS_CSV" "${RESULTS_CSV}.bak"
fi
echo "model,scheme,actorder,task,metric,status,eval_backend,save_dir" > "$RESULTS_CSV"
# ── Main loop ────────────────────────────────────────────────────────────────
TOTAL=0
PASSED=0
FAILED=0
for model_idx in "${!SCRIPTS[@]}"; do
    script="${SCRIPTS[$model_idx]}"
    model_name="${MODEL_SHORT_NAMES[$model_idx]}"
    scheme="${MODEL_SCHEMES[$model_idx]}"
    IFS=',' read -r max_model_len tp_size num_gpus_quant <<< "${MODEL_VLLM_ARGS[$model_idx]}"

    for actorder_state in "${ACTORDER_STATES[@]}"; do
        save_dir="$MODEL_BASE_DIR/${model_name}-${scheme}-${actorder_state}"

        echo ""
        echo "╔══════════════════════════════════════════════════════════════════════════════════════════╗"
        echo "║ MODEL: $model_name"
        echo "║ SCHEME: $scheme"
        echo "║ ACTORDER: $actorder_state"
        echo "╚══════════════════════════════════════════════════════════════════════════════════════════╝"
        echo ""

        # ── Skip entirely if all evals already have results ────
        all_evals_cached=true
        for eval_idx in "${!EVAL_NAMES[@]}"; do
            eval_dir="$EVAL_BASE_DIR/${model_name}-${scheme}-${actorder_state}/${EVAL_NAMES[$eval_idx]}"
            if ! find "$eval_dir" -name "results_*.json" -type f 2>/dev/null | grep -q .; then
                all_evals_cached=false
                break
            fi
        done
        if [ "$all_evals_cached" = true ]; then
            echo "All evals already cached, skipping quantization and eval."
            for eval_idx in "${!EVAL_NAMES[@]}"; do
                eval_name="${EVAL_NAMES[$eval_idx]}"
                lm_task="${EVAL_LM_TASKS[$eval_idx]}"
                eval_dir="$EVAL_BASE_DIR/${model_name}-${scheme}-${actorder_state}/${eval_name}"
                metric_val=$(extract_metric "$eval_dir" "$lm_task")
                echo " $eval_name: $metric_val"
                echo "$model_name,$scheme,$actorder_state,$eval_name,$metric_val,PASSED,cached,$save_dir" >> "$RESULTS_CSV"
                PASSED=$((PASSED + 1))
                TOTAL=$((TOTAL + 1))
            done
            print_summary
            print_comparison
            continue
        fi

        # ── Quantize (skip if model already exists) ────────────
        if [ -d "$save_dir" ] && [ -f "$save_dir/config.json" ]; then
            echo "Quantized model already exists at $save_dir, skipping quantization."
        else
            activate_quant_env
            echo "============================================"
            echo "Running: $script (actorder_state=$actorder_state)"
            echo "============================================"

            actorder_arg=""
            if [ "$actorder_state" == "with-actorder" ]; then
                actorder_arg="--actorder weight"
            fi

            if [ "$num_gpus_quant" -gt 1 ]; then
                torchrun --nproc_per_node="$num_gpus_quant" "$script" \
                    $actorder_arg --save-dir "$save_dir" 2>&1
            else
                python "$script" $actorder_arg --save-dir "$save_dir" 2>&1
            fi
            quant_status=$?
            if [ $quant_status -ne 0 ]; then
                echo "QUANTIZATION FAILED for $model_name / $scheme / $actorder_state"
                for eval_name in "${EVAL_NAMES[@]}"; do
                    echo "$model_name,$scheme,$actorder_state,$eval_name,N/A,QUANT_FAILED,N/A,$save_dir" >> "$RESULTS_CSV"
                done
                FAILED=$((FAILED + ${#EVAL_NAMES[@]}))
                TOTAL=$((TOTAL + ${#EVAL_NAMES[@]}))
                print_summary
                print_comparison
                continue
            fi
        fi

        # ── Clear GPU memory before eval ─────────────────────────
        python3 -c "import torch; torch.cuda.empty_cache(); [torch.cuda.reset_peak_memory_stats(i) for i in range(torch.cuda.device_count())]" 2>/dev/null

        # ── Evaluate ─────────────────────────────────────────────
        for eval_idx in "${!EVAL_NAMES[@]}"; do
            eval_name="${EVAL_NAMES[$eval_idx]}"
            lm_task="${EVAL_LM_TASKS[$eval_idx]}"
            fewshot="${EVAL_FEWSHOT[$eval_idx]}"
            backend="${EVAL_BACKENDS[$eval_idx]}"
            eval_dir="$EVAL_BASE_DIR/${model_name}-${scheme}-${actorder_state}/${eval_name}"
            TOTAL=$((TOTAL + 1))

            existing_result=$(find "$eval_dir" -name "results_*.json" -type f 2>/dev/null | sort | tail -1)
            if [ -n "$existing_result" ]; then
                metric_val=$(extract_metric "$eval_dir" "$lm_task")
                echo " EVAL: $eval_name — skipping, previous result found: $metric_val"
                echo "$model_name,$scheme,$actorder_state,$eval_name,$metric_val,PASSED,cached,$save_dir" >> "$RESULTS_CSV"
                PASSED=$((PASSED + 1))
                continue
            fi

            if [ "$backend" == "hf" ]; then
                run_hf_eval "$save_dir" "$lm_task" "$fewshot" "$eval_dir"
            else
                run_vllm_eval "$save_dir" "$lm_task" "$fewshot" "$max_model_len" "$tp_size" "$eval_dir"
            fi
            eval_status=$?
            if [ $eval_status -eq 0 ]; then
                metric_val=$(extract_metric "$eval_dir" "$lm_task")
                echo "$model_name,$scheme,$actorder_state,$eval_name,$metric_val,PASSED,$EVAL_BACKEND,$save_dir" >> "$RESULTS_CSV"
                PASSED=$((PASSED + 1))
            else
                echo "$model_name,$scheme,$actorder_state,$eval_name,N/A,FAILED,$EVAL_BACKEND,$save_dir" >> "$RESULTS_CSV"
                FAILED=$((FAILED + 1))
            fi
        done

        # ── Clean up model to free disk space ────────────────────
        if [ -d "$save_dir" ]; then
            echo "Removing quantized model at $save_dir to free disk space."
            rm -rf "$save_dir"
        fi

        print_summary
        print_comparison
    done # actorder_state
done # model
# ── Final Summary ────────────────────────────────────────────────────────────
echo ""
echo "╔══════════════════════════════════════════════════════════════════════════════════════════╗"
echo "║ FINAL SUMMARY: $PASSED passed, $FAILED failed out of $TOTAL total evaluations ║"
echo "╚══════════════════════════════════════════════════════════════════════════════════════════╝"
echo ""
print_summary
print_comparison
echo "Results CSV: $RESULTS_CSV"
echo "Saved models: $MODEL_BASE_DIR/"
echo "Eval outputs: $EVAL_BASE_DIR/"
echo ""
echo "To extract detailed metrics from the log:"
echo " python extract_log_summary.py regression_results_fp8.log"
#!/bin/bash
# W4A16 GPTQ actorder Regression Test Suite
# Compares without-actorder vs with-actorder (actorder=weight) vs
# with-group-actorder (actorder=group) for W4A16 GPTQ quantization.
#
# Usage:
#   ./run_all_tests_w4a16.sh 2>&1 | tee regression_results_w4a16.log
#   python extract_log_summary.py regression_results_w4a16.log
set -o pipefail
# Avoid permission errors on shared HF cache files
export HF_DATASETS_CACHE="/tmp/hf_datasets_cache"
mkdir -p "$HF_DATASETS_CACHE"
# ── Configuration ────────────────────────────────────────────────────────────
SCRIPTS=(
    "testing/llama3_w4a16_gptq.py"
)
MODEL_SHORT_NAMES=(
    "Meta-Llama-3-8B-Instruct"
)
MODEL_SCHEMES=(
    "W4A16"
)
# vLLM eval settings per entry: max_model_len,tensor_parallel_size,num_gpus_quant
MODEL_VLLM_ARGS=(
    "2048,1,1"
)
# without-actorder: no actorder flag (standard GPTQ)
# with-actorder: actorder=weight
# with-group-actorder: actorder=group
ACTORDER_STATES=("without-actorder" "with-actorder" "with-group-actorder")
EVAL_NAMES=("gsm8k" "gsm8k_platinum" "wikitext" "mmlu")
EVAL_LM_TASKS=("gsm8k" "gsm8k_platinum" "wikitext" "mmlu")
EVAL_FEWSHOT=("5" "5" "0" "5")
EVAL_BACKENDS=("vllm" "vllm" "vllm" "vllm")
EVAL_BASE_DIR="./eval_results"
MODEL_BASE_DIR="./regression_models"
RESULTS_CSV="regression_results_w4a16.csv"
mkdir -p "$EVAL_BASE_DIR" "$MODEL_BASE_DIR"
# ── Helper: activate environments ────────────────────────────────────────────
activate_quant_env() {
    source /home/HDCharles/rhdev/bin/activate
}
activate_eval_env() {
    source /home/HDCharles/vllm/bin/activate
}
# ── Helper: run vLLM evaluation with fallback chain ──────────────────────────
EVAL_BACKEND=""
run_vllm_eval() {
    local save_dir=$1
    local task=$2
    local num_fewshot=$3
    local max_model_len=$4
    local tp_size=$5
    local eval_output_dir=$6
    mkdir -p "$eval_output_dir"
    EVAL_BACKEND="FAILED"
    activate_eval_env

    echo " EVAL: $task (fewshot=$num_fewshot, tp=$tp_size, max_len=$max_model_len)"

    local chat_args="--apply_chat_template"
    if [ "$num_fewshot" -gt 0 ]; then
        chat_args="$chat_args --fewshot_as_multiturn"
    fi

    if [ "$tp_size" -gt 1 ]; then
        lm_eval \
            --model vllm \
            --model_args "pretrained=$save_dir,dtype=auto,max_model_len=$max_model_len,add_bos_token=True,tensor_parallel_size=$tp_size,gpu_memory_utilization=0.85" \
            --tasks "$task" \
            --num_fewshot "$num_fewshot" \
            --batch_size auto \
            $chat_args \
            --output_path "$eval_output_dir" 2>&1
        if [ $? -eq 0 ]; then EVAL_BACKEND="vllm_tp${tp_size}"; return 0; fi

        echo " TP=$tp_size failed, trying expert_parallel..."
        lm_eval \
            --model vllm \
            --model_args "pretrained=$save_dir,dtype=auto,max_model_len=$max_model_len,add_bos_token=True,enable_expert_parallel=True,gpu_memory_utilization=0.85" \
            --tasks "$task" \
            --num_fewshot "$num_fewshot" \
            --batch_size auto \
            $chat_args \
            --output_path "$eval_output_dir" 2>&1
        if [ $? -eq 0 ]; then EVAL_BACKEND="vllm_expert_parallel"; return 0; fi
    fi

    echo " Trying TP=1..."
    lm_eval \
        --model vllm \
        --model_args "pretrained=$save_dir,dtype=auto,max_model_len=$max_model_len,add_bos_token=True,gpu_memory_utilization=0.85" \
        --tasks "$task" \
        --num_fewshot "$num_fewshot" \
        --batch_size auto \
        $chat_args \
        --output_path "$eval_output_dir" 2>&1
    if [ $? -eq 0 ]; then EVAL_BACKEND="vllm_tp1"; return 0; fi

    echo " Trying enforce_eager..."
    lm_eval \
        --model vllm \
        --model_args "pretrained=$save_dir,dtype=auto,max_model_len=$max_model_len,add_bos_token=True,enforce_eager=True,gpu_memory_utilization=0.85" \
        --tasks "$task" \
        --num_fewshot "$num_fewshot" \
        --batch_size auto \
        $chat_args \
        --output_path "$eval_output_dir" 2>&1
    if [ $? -eq 0 ]; then EVAL_BACKEND="vllm_eager"; return 0; fi

    echo " Trying hf backend as last resort..."
    lm_eval \
        --model hf \
        --model_args "pretrained=$save_dir,dtype=auto,add_bos_token=True" \
        --tasks "$task" \
        --num_fewshot "$num_fewshot" \
        --batch_size auto \
        $chat_args \
        --output_path "$eval_output_dir" 2>&1
    if [ $? -eq 0 ]; then EVAL_BACKEND="hf"; return 0; fi

    EVAL_BACKEND="FAILED"
    return 1
}
# ── Helper: run HF-only evaluation ─────────────────────────────────────────
run_hf_eval() {
    local save_dir=$1
    local task=$2
    local num_fewshot=$3
    local eval_output_dir=$4
    mkdir -p "$eval_output_dir"
    EVAL_BACKEND="FAILED"
    activate_eval_env

    echo " EVAL: $task (fewshot=$num_fewshot, backend=hf)"

    local chat_args="--apply_chat_template"
    if [ "$num_fewshot" -gt 0 ]; then
        chat_args="$chat_args --fewshot_as_multiturn"
    fi

    lm_eval \
        --model hf \
        --model_args "pretrained=$save_dir,dtype=auto,add_bos_token=True" \
        --tasks "$task" \
        --num_fewshot "$num_fewshot" \
        --batch_size auto \
        $chat_args \
        --output_path "$eval_output_dir" 2>&1
    if [ $? -eq 0 ]; then EVAL_BACKEND="hf"; return 0; fi

    EVAL_BACKEND="FAILED"
    return 1
}
# ── Helper: extract metric from lm_eval JSON results ────────────────────────
extract_metric() {
    local eval_output_dir=$1
    local task=$2
    local results_json
    results_json=$(find "$eval_output_dir" -name "results_*.json" -type f 2>/dev/null | sort | tail -1)
    if [ -z "$results_json" ]; then
        echo "N/A"
        return
    fi
    python3 -c "
import json, sys

with open('$results_json') as f:
    data = json.load(f)
results = data.get('results', {})
task = '$task'

task_results = None
for key in results:
    if task in key:
        task_results = results[key]
        break
if task_results is None:
    print('N/A')
    sys.exit()

if 'gsm8k' in task:
    val = task_results.get('exact_match,strict-match')
    if val is not None:
        print(f'{val*100:.2f}%')
    else:
        print('N/A')
elif 'wikitext' in task:
    val = task_results.get('word_perplexity,none')
    if val is not None:
        print(f'{val:.2f}')
    else:
        print('N/A')
elif 'mmlu' in task:
    val = task_results.get('acc,none')
    if val is not None:
        print(f'{val*100:.2f}%')
    else:
        print('N/A')
else:
    for k, v in task_results.items():
        if 'stderr' not in k and k != 'alias' and isinstance(v, (int, float)):
            print(f'{v:.4f}')
            sys.exit()
    print('N/A')
" 2>/dev/null || echo "N/A"
}
# ── Helper: print current results summary ────────────────────────────────────
print_summary() {
    echo ""
    echo "╔══════════════════════════════════════════════════════════════════════════════════════════════════════╗"
    echo "║ RESULTS SUMMARY (so far) ║"
    echo "╚══════════════════════════════════════════════════════════════════════════════════════════════════════╝"
    echo ""
    if [ -f "$RESULTS_CSV" ]; then
        column -t -s',' < "$RESULTS_CSV"
    else
        echo "(no results yet)"
    fi
    echo ""
    echo "════════════════════════════════════════════════════════════════════════════════════════════════════════"
    echo ""
}
# ── Helper: print actorder comparison table ─────────────────────────────────
print_comparison() {
    if [ ! -f "$RESULTS_CSV" ]; then
        return
    fi
    python3 - "$RESULTS_CSV" <<'PYEOF'
import csv, sys

csv_path = sys.argv[1]
rows = []
with open(csv_path) as f:
    reader = csv.DictReader(f)
    for r in reader:
        rows.append(r)
if not rows:
    sys.exit()

actorder_keys = ["without-actorder", "with-actorder", "with-group-actorder"]

lookup = {}
for r in rows:
    key = (r["model"], r["scheme"], r["task"])
    lookup.setdefault(key, {})
    lookup[key][r["actorder"]] = r["metric"]

entries = [(k, v) for k, v in lookup.items()
           if "without-actorder" in v and
           any(s in v for s in actorder_keys[1:])]
if not entries:
    sys.exit()

def parse_metric(s):
    s = s.strip()
    if s.endswith("%"):
        return float(s[:-1]), True
    try:
        return float(s), False
    except ValueError:
        return None, False

def calc_improvement(baseline_str, compare_str, task):
    b_val, _ = parse_metric(baseline_str)
    c_val, _ = parse_metric(compare_str)
    if b_val is None or c_val is None or b_val == 0:
        return "N/A"
    if "wikitext" in task:
        pct = (b_val - c_val) / b_val * 100
    else:
        pct = (c_val - b_val) / b_val * 100
    sign = "+" if pct >= 0 else ""
    return f"{sign}{pct:.2f}%"

print("")
print("╔════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╗")
print("║ ACTORDER COMPARISON (vs without-actorder baseline) ║")
print("╚════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╝")
print("")
header = (f"{'model':<36} {'scheme':<12} {'task':<18} "
          f"{'no-actorder':>14} "
          f"{'weight':>14} {'wt vs base':>12} "
          f"{'group':>14} {'grp vs base':>12}")
print(header)
print("-" * len(header))
for (model, scheme, task), metrics in sorted(entries):
    wo = metrics.get("without-actorder", "")
    wi = metrics.get("with-actorder", "")
    wg = metrics.get("with-group-actorder", "")
    wi_imp = calc_improvement(wo, wi, task) if wo and wi else ""
    wg_imp = calc_improvement(wo, wg, task) if wo and wg else ""
    print(f"{model:<36} {scheme:<12} {task:<18} "
          f"{wo:>14} "
          f"{wi:>14} {wi_imp:>12} "
          f"{wg:>14} {wg_imp:>12}")
print("")
PYEOF
}
# ── Initialize results CSV (preserve previous results) ──────────────────────
if [ -f "$RESULTS_CSV" ]; then
cp "$RESULTS_CSV" "${RESULTS_CSV}.bak"
fi
echo "model,scheme,actorder,task,metric,status,eval_backend,save_dir" > "$RESULTS_CSV"
# ── Main loop ────────────────────────────────────────────────────────────────
TOTAL=0
PASSED=0
FAILED=0
for model_idx in "${!SCRIPTS[@]}"; do
    script="${SCRIPTS[$model_idx]}"
    model_name="${MODEL_SHORT_NAMES[$model_idx]}"
    scheme="${MODEL_SCHEMES[$model_idx]}"
    IFS=',' read -r max_model_len tp_size num_gpus_quant <<< "${MODEL_VLLM_ARGS[$model_idx]}"

    for actorder_state in "${ACTORDER_STATES[@]}"; do
        save_dir="$MODEL_BASE_DIR/${model_name}-${scheme}-${actorder_state}"

        echo ""
        echo "╔══════════════════════════════════════════════════════════════════════════════════════════╗"
        echo "║ MODEL: $model_name"
        echo "║ SCHEME: $scheme"
        echo "║ ACTORDER: $actorder_state"
        echo "╚══════════════════════════════════════════════════════════════════════════════════════════╝"
        echo ""

        # ── Skip entirely if all evals already have results ────
        all_evals_cached=true
        for eval_idx in "${!EVAL_NAMES[@]}"; do
            eval_dir="$EVAL_BASE_DIR/${model_name}-${scheme}-${actorder_state}/${EVAL_NAMES[$eval_idx]}"
            if ! find "$eval_dir" -name "results_*.json" -type f 2>/dev/null | grep -q .; then
                all_evals_cached=false
                break
            fi
        done
        if [ "$all_evals_cached" = true ]; then
            echo "All evals already cached, skipping quantization and eval."
            for eval_idx in "${!EVAL_NAMES[@]}"; do
                eval_name="${EVAL_NAMES[$eval_idx]}"
                lm_task="${EVAL_LM_TASKS[$eval_idx]}"
                eval_dir="$EVAL_BASE_DIR/${model_name}-${scheme}-${actorder_state}/${eval_name}"
                metric_val=$(extract_metric "$eval_dir" "$lm_task")
                echo " $eval_name: $metric_val"
                echo "$model_name,$scheme,$actorder_state,$eval_name,$metric_val,PASSED,cached,$save_dir" >> "$RESULTS_CSV"
                PASSED=$((PASSED + 1))
                TOTAL=$((TOTAL + 1))
            done
            print_summary
            print_comparison
            continue
        fi

        # ── Quantize (skip if model already exists) ────────────
        if [ -d "$save_dir" ] && [ -f "$save_dir/config.json" ]; then
            echo "Quantized model already exists at $save_dir, skipping quantization."
        else
            activate_quant_env
            echo "============================================"
            echo "Running: $script (actorder_state=$actorder_state)"
            echo "============================================"

            actorder_arg=""
            if [ "$actorder_state" == "with-actorder" ]; then
                actorder_arg="--actorder weight"
            elif [ "$actorder_state" == "with-group-actorder" ]; then
                actorder_arg="--actorder group"
            fi

            if [ "$num_gpus_quant" -gt 1 ]; then
                torchrun --nproc_per_node="$num_gpus_quant" "$script" \
                    $actorder_arg --save-dir "$save_dir" 2>&1
            else
                python "$script" $actorder_arg --save-dir "$save_dir" 2>&1
            fi
            quant_status=$?
            if [ $quant_status -ne 0 ]; then
                echo "QUANTIZATION FAILED for $model_name / $scheme / $actorder_state"
                for eval_name in "${EVAL_NAMES[@]}"; do
                    echo "$model_name,$scheme,$actorder_state,$eval_name,N/A,QUANT_FAILED,N/A,$save_dir" >> "$RESULTS_CSV"
                done
                FAILED=$((FAILED + ${#EVAL_NAMES[@]}))
                TOTAL=$((TOTAL + ${#EVAL_NAMES[@]}))
                print_summary
                print_comparison
                continue
            fi
        fi

        # ── Clear GPU memory before eval ─────────────────────────
        python3 -c "import torch; torch.cuda.empty_cache(); [torch.cuda.reset_peak_memory_stats(i) for i in range(torch.cuda.device_count())]" 2>/dev/null

        # ── Evaluate ─────────────────────────────────────────────
        for eval_idx in "${!EVAL_NAMES[@]}"; do
            eval_name="${EVAL_NAMES[$eval_idx]}"
            lm_task="${EVAL_LM_TASKS[$eval_idx]}"
            fewshot="${EVAL_FEWSHOT[$eval_idx]}"
            backend="${EVAL_BACKENDS[$eval_idx]}"
            eval_dir="$EVAL_BASE_DIR/${model_name}-${scheme}-${actorder_state}/${eval_name}"
            TOTAL=$((TOTAL + 1))

            existing_result=$(find "$eval_dir" -name "results_*.json" -type f 2>/dev/null | sort | tail -1)
            if [ -n "$existing_result" ]; then
                metric_val=$(extract_metric "$eval_dir" "$lm_task")
                echo " EVAL: $eval_name — skipping, previous result found: $metric_val"
                echo "$model_name,$scheme,$actorder_state,$eval_name,$metric_val,PASSED,cached,$save_dir" >> "$RESULTS_CSV"
                PASSED=$((PASSED + 1))
                continue
            fi

            if [ "$backend" == "hf" ]; then
                run_hf_eval "$save_dir" "$lm_task" "$fewshot" "$eval_dir"
            else
                run_vllm_eval "$save_dir" "$lm_task" "$fewshot" "$max_model_len" "$tp_size" "$eval_dir"
            fi
            eval_status=$?
            if [ $eval_status -eq 0 ]; then
                metric_val=$(extract_metric "$eval_dir" "$lm_task")
                echo "$model_name,$scheme,$actorder_state,$eval_name,$metric_val,PASSED,$EVAL_BACKEND,$save_dir" >> "$RESULTS_CSV"
                PASSED=$((PASSED + 1))
            else
                echo "$model_name,$scheme,$actorder_state,$eval_name,N/A,FAILED,$EVAL_BACKEND,$save_dir" >> "$RESULTS_CSV"
                FAILED=$((FAILED + 1))
            fi
        done

        # ── Clean up model to free disk space ────────────────────
        if [ -d "$save_dir" ]; then
            echo "Removing quantized model at $save_dir to free disk space."
            rm -rf "$save_dir"
        fi

        print_summary
        print_comparison
    done # actorder_state
done # model
# ── Final Summary ────────────────────────────────────────────────────────────
echo ""
echo "╔══════════════════════════════════════════════════════════════════════════════════════════╗"
echo "║ FINAL SUMMARY: $PASSED passed, $FAILED failed out of $TOTAL total evaluations ║"
echo "╚══════════════════════════════════════════════════════════════════════════════════════════╝"
echo ""
print_summary
print_comparison
echo "Results CSV: $RESULTS_CSV"
echo "Saved models: $MODEL_BASE_DIR/"
echo "Eval outputs: $EVAL_BASE_DIR/"
echo ""
echo "To extract detailed metrics from the log:"
echo " python extract_log_summary.py regression_results_w4a16.log"