@andrewor14
Created September 15, 2025 17:34
Unsloth QAT LoRA
from unsloth import FastLanguageModel
import torch
from torchao.quantization import Int4WeightOnlyConfig
from transformers import AutoModelForCausalLM, TextStreamer, TorchAoConfig
# "int4" QAT during LoRA fine-tuning, matching the int4 weight-only
# post-training quantization applied after training below.
qat_scheme = "int4"
save_output_path = "/tmp/unsloth_model"
max_seq_length = 2048
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = torch.bfloat16,
    load_in_4bit = False,
    full_finetuning = False,
)
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
    qat_scheme = qat_scheme,
)
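
# A quick check (a sketch, not part of the original gist): assuming get_peft_model
# returns a standard PEFT-wrapped model, this confirms that only the LoRA adapter
# weights are trainable.
model.print_trainable_parameters()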
# ============
# Data prep |
# ============
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs = examples["input"]
    outputs = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass
from datasets import load_dataset
dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)
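
# Optional sanity check (a sketch, not part of the original gist): print the
# first formatted example to confirm the Alpaca template and EOS token were
# applied by formatting_prompts_func.
print(dataset[0]["text"])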
# ========
# Train |
# ========
from trl import SFTConfig, SFTTrainer
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    packing = False, # Packing can make training 5x faster for short sequences.
    args = SFTConfig(
        per_device_train_batch_size = 16,
        gradient_accumulation_steps = 1,
        warmup_steps = 5,
        num_train_epochs = 1, # One full pass over the data; overridden by max_steps below.
        max_steps = 10,
        learning_rate = 4e-5,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use "wandb" etc. for experiment tracking.
    ),
)
trainer_stats = trainer.train()
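
# Quick summary of the run (a sketch, assuming the standard transformers
# TrainOutput return value): .metrics includes train_runtime and train_loss.
print(trainer_stats.metrics)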
# =============
# Model save |
# =============
model.save_pretrained(save_output_path)
tokenizer.save_pretrained(save_output_path)
quantization_config = TorchAoConfig(Int4WeightOnlyConfig())
quantized_model = AutoModelForCausalLM.from_pretrained(
    save_output_path,
    torch_dtype="auto",
    device_map="auto",
    quantization_config=quantization_config,
)
quantized_model.save_pretrained(save_output_path + "_quantized", safe_serialization=False)
tokenizer.save_pretrained(save_output_path + "_quantized")
# ============
# Fibonacci |
# ============
inputs = tokenizer(
    [
        alpaca_prompt.format(
            "Continue the Fibonacci sequence.", # instruction
            "1, 1, 2, 3, 5, 8", # input
            "", # output - leave this blank for generation!
        )
    ],
    return_tensors = "pt",
).to("cuda")
text_streamer = TextStreamer(tokenizer)
_ = quantized_model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)