Created September 15, 2025 17:34
Unsloth QAT LoRA
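An end-to-end example: fine-tune unsloth/Meta-Llama-3.1-8B with LoRA and int4 quantization-aware training (QAT) on yahma/alpaca-cleaned, save the fine-tuned model, quantize it to int4 weight-only with torchao, and run a sample generation with the quantized model.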
from unsloth import FastLanguageModel
import torch
from torchao.quantization import Int4WeightOnlyConfig
from transformers import AutoModelForCausalLM, TextStreamer, TorchAoConfig

qat_scheme = "int4"
save_output_path = "/tmp/unsloth_model"
max_seq_length = 2048

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = torch.bfloat16,
    load_in_4bit = False,
    full_finetuning = False,
)

model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
    qat_scheme = qat_scheme, # Quantization-aware training: simulate int4 quantization during fine-tuning.
)
# ============
# Data prep
# ============

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs = examples["input"]
    outputs = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

from datasets import load_dataset
dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)
# ========
# Train
# ========

from trl import SFTConfig, SFTTrainer

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    packing = False, # Packing can make training 5x faster for short sequences.
    args = SFTConfig(
        per_device_train_batch_size = 16,
        gradient_accumulation_steps = 1,
        warmup_steps = 5,
        num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 10, # Overrides num_train_epochs; remove for a full training run.
        learning_rate = 4e-5,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)
trainer_stats = trainer.train()
# =============
# Model save
# =============

model.save_pretrained(save_output_path)
tokenizer.save_pretrained(save_output_path)

# Quantize the fine-tuned model to int4 weight-only with torchao.
quantization_config = TorchAoConfig(Int4WeightOnlyConfig())
quantized_model = AutoModelForCausalLM.from_pretrained(
    save_output_path,
    torch_dtype="auto",
    device_map="auto",
    quantization_config=quantization_config,
)
# torchao uses tensor subclasses, which require non-safetensors serialization.
quantized_model.save_pretrained(save_output_path + "_quantized", safe_serialization=False)
tokenizer.save_pretrained(save_output_path + "_quantized")
# ============
# Fibonacci
# ============

inputs = tokenizer(
    [
        alpaca_prompt.format(
            "Continue the Fibonacci sequence.", # instruction
            "1, 1, 2, 3, 5, 8", # input
            "", # output - leave this blank for generation!
        )
    ], return_tensors = "pt").to("cuda")

text_streamer = TextStreamer(tokenizer)
_ = quantized_model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)
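The int4 checkpoint written above can be reloaded later for inference without repeating the quantization step. A minimal sketch, assuming torchao is installed in the serving environment and reusing save_output_path from above:

# Reload the saved torchao int4 checkpoint for inference (sketch).
from transformers import AutoModelForCausalLM, AutoTokenizer

reloaded_model = AutoModelForCausalLM.from_pretrained(
    save_output_path + "_quantized",
    device_map="auto",
)
reloaded_tokenizer = AutoTokenizer.from_pretrained(save_output_path + "_quantized")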