@mendhak
Created July 24, 2023 22:13
Run Llama2 on GPU
import time

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

timeStart = time.time()

tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf"
)

# Load the model weights in bfloat16 and let accelerate place them
# on the available GPU(s) via device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

print("Load model time: ", time.time() - timeStart)

# Simple REPL: read a prompt and a token budget, generate, print, repeat.
while True:
    input_str = input('Enter: ')
    if input_str == 'exit':
        break
    input_token_length = input('Enter length: ')

    timeStart = time.time()

    # Tokenize the prompt and move the input IDs to the GPU.
    inputs = tokenizer.encode(
        input_str,
        return_tensors="pt",
    ).to("cuda")

    outputs = model.generate(
        inputs,
        max_new_tokens=int(input_token_length),
    )

    output_str = tokenizer.decode(outputs[0])
    print(output_str)
    print("Time taken: ", time.time() - timeStart)