@davidberenstein1957
Last active May 16, 2025 09:04
Optimize Qwen3 model inference with Pruna
# /// script
# requires-python = ">=3.11,<3.12"
# dependencies = [
#     "pruna",
#     "transformers",
# ]
# ///
from transformers import pipeline
from pruna import SmashConfig, smash

# Load the Qwen3-32B text-generation pipeline
model_name = "Qwen/Qwen3-32B"
pipe = pipeline("text-generation", model=model_name)
# Initialize the SmashConfig
smash_config = SmashConfig(cache_dir_prefix="/efs/smash_cache")
# Quantize weights to 4 bits with HQQ, computing in bfloat16
smash_config["quantizer"] = "hqq"
smash_config["hqq_weight_bits"] = 4
smash_config["hqq_compute_dtype"] = "torch.bfloat16"
# Compile the quantized model with torch.compile (full graph, dynamic shapes)
smash_config["compiler"] = "torch_compile"
smash_config["torch_compile_fullgraph"] = True
smash_config["torch_compile_dynamic"] = True
# Smash the model
model = smash(
    model=pipe.model,
    smash_config=smash_config,
)
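
A minimal inference sketch after smashing, assuming the object returned by smash can be swapped back in as a drop-in replacement for pipe.model; the prompt and generation arguments below are illustrative and not part of the original gist:

# Assumption: the smashed model is a drop-in replacement for pipe.model
pipe.model = model

# Illustrative prompt and generation settings
result = pipe("Give a one-sentence summary of model quantization.", max_new_tokens=64)
print(result[0]["generated_text"])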