Run autoawq + fused modules using HF transformers on a custom model
from transformers import AutoModelForCausalLM, AwqConfig

model_name = "TheBloke/Mistral-7B-OpenOrca-AWQ"
code_revision = "f1b2cd1b7459ceecfdc1fac5bb8725f13707c589"

# Describe the custom architecture so transformers can build the fused AWQ modules:
# the attention / layernorm / MLP module names plus the model's shape parameters.
quantization_config = AwqConfig(
    bits=4,
    fuse_max_seq_len=512,  # maximum sequence length the fused attention cache is allocated for
    modules_to_fuse={
        "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
        "layernorm": ["ln1", "ln2", "norm"],
        "mlp": ["gate_proj", "up_proj", "down_proj"],
        "use_alibi": False,
        "num_attention_heads": 56,
        "num_key_value_heads": 8,
        "hidden_size": 7168,
    },
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    trust_remote_code=True,   # the checkpoint ships its own modeling code
    device_map="auto",
    revision=code_revision,   # pin the repo to a known commit
)
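
Once the model is loaded with the fused modules, generation goes through the usual transformers API. A minimal sketch, assuming the model and tokenizer load as above; the prompt and generation settings here are illustrative and not part of the original gist:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    model_name, trust_remote_code=True, revision=code_revision
)

prompt = "What is AWQ quantization?"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Keep prompt + generated tokens within fuse_max_seq_len, since the fused
# attention modules pre-allocate their cache for that many positions.
output_ids = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))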