QwQ-32B NF4 Inference
!pip install -Uq transformers bitsandbytes

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
)

# Load the pre-quantized NF4 checkpoint; device_map="auto" places the
# weights on the available GPU(s).
model_name = "ariG23498/QwQ-32B-nf4"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
)

# Wrap the prompt in the model's chat template and append the
# generation prompt so the model answers as the assistant.
prompt = "How many r's are in the word \"strawberry\""
messages = [
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# Generate; the large max_new_tokens budget leaves room for QwQ's
# long chain-of-thought reasoning.
with torch.no_grad():
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=32768,
    )

# Strip the prompt tokens so only the newly generated text is decoded.
generated_ids = [
    output_ids[len(input_ids):]
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
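
With a 32768-token budget the call above can run for a long time before anything is printed. As an illustrative addition (not part of the original gist), transformers' TextStreamer can print tokens to stdout as they are generated:

from transformers import TextStreamer

# Stream tokens as they are generated; skip_prompt avoids re-printing
# the input prompt. (Sketch added for convenience, not from the gist.)
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

with torch.no_grad():
    model.generate(
        **model_inputs,
        max_new_tokens=32768,
        streamer=streamer,
    )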
To create the quantized model:

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

model_name = "Qwen/QwQ-32B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# NF4 4-bit quantization with nested (double) quantization of the
# quantization constants; matmuls are computed in bfloat16.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=nf4_config,
)
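
Presumably the quantized weights were then uploaded so the inference script above can load them directly. A minimal sketch, assuming the target repo is the ariG23498/QwQ-32B-nf4 checkpoint referenced above and that you are logged in to the Hub:

# Serialize the 4-bit weights and upload them (requires a recent
# transformers/bitsandbytes that supports saving 4-bit checkpoints,
# and prior authentication, e.g. via `huggingface-cli login`).
model.push_to_hub("ariG23498/QwQ-32B-nf4")
tokenizer.push_to_hub("ariG23498/QwQ-32B-nf4")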
