load-codefuse-codellama-34B-4bits
import os
import torch
import time

from modelscope import AutoTokenizer, snapshot_download
from auto_gptq import AutoGPTQForCausalLM

os.environ["TOKENIZERS_PARALLELISM"] = "false"
def load_model_tokenizer(model_path):
    """
    Load the model and tokenizer from the given model name or the local path of a downloaded model.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False, legacy=False)
    tokenizer.padding_side = "left"
    tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids("<unk>")
    tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids("</s>")
    print(f"tokenizer loaded: {tokenizer}")
    # Load the GPTQ-quantized 4-bit checkpoint with exllama kernels enabled.
    model = AutoGPTQForCausalLM.from_quantized(
        model_path,
        inject_fused_attention=False,
        inject_fused_mlp=False,
        use_safetensors=False,
        use_cuda_fp16=True,
        disable_exllama=False,
        device_map='auto',
    )
    print(f"model loaded: {model}")
    return model, tokenizer
def inference(model, tokenizer, prompt):
    """
    Use the given model and tokenizer to generate an answer for the specified prompt.
    """
    st = time.time()
    prompt = prompt if prompt.endswith('\n') else f'{prompt}\n'
    # Wrap the prompt in the CodeFuse chat template: one human turn followed by an empty bot turn.
    inputs = f"<|role_start|>human<|role_end|>{prompt}<|role_start|>bot<|role_end|>"
    input_ids = tokenizer.encode(inputs, return_tensors="pt", padding=True, add_special_tokens=False).to("cuda")
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=input_ids,
            top_p=0.95,
            temperature=0.1,
            do_sample=True,
            max_new_tokens=512,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )
    print(f'generated tokens num is {len(generated_ids[0][input_ids.size(1):])}')
    outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    print(f'generated text is {outputs[0][len(inputs):]}')
    latency = time.time() - st
    print('latency is {} seconds'.format(latency))
if __name__ == "__main__":
    # Download the 4-bit GPTQ checkpoint from ModelScope (or reuse the local cache).
    model_path = snapshot_download('codefuse-ai/CodeFuse-CodeLlama-34B-4bits', revision='v1.0.0')
    print(f"model_path: {model_path}")
    model, tokenizer = load_model_tokenizer(model_path)
    prompt = 'Please write a QuickSort program in Python'
    inference(model, tokenizer, prompt)
    # Chinese prompt: "How can the startup speed of a Spring Boot application be optimized?"
    prompt = 'SpringBoot的程序,如何进行启动速度的优化?'
    inference(model, tokenizer, prompt)
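
As the docstring of load_model_tokenizer notes, the loader also accepts a local path to an already-downloaded checkpoint instead of a ModelScope model name. A minimal usage sketch, assuming the script above has run once and cached the weights (the directory below is hypothetical, not part of the gist):

# Hypothetical local directory holding the downloaded 4-bit checkpoint.
local_model_path = "/data/models/CodeFuse-CodeLlama-34B-4bits"
model, tokenizer = load_model_tokenizer(local_model_path)
inference(model, tokenizer, 'Please write a QuickSort program in Python')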