After following these instructions to run Llama 2 on the CPU:
https://dev.to/nithinibhandari1999/how-to-run-llama-2-on-your-local-computer-42g1
the modified Python code to run inference on a GPU is below.
import time
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

timeStart = time.time()

# Load the tokenizer and model; device_map="auto" places the weights on the GPU.
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf"
)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True
)

print("Load model time: ", time.time() - timeStart)

while True:
    input_str = input('Enter: ')
    if input_str == 'exit':
        break
    input_token_length = input('Enter length: ')

    timeStart = time.time()

    # Tokenize the prompt and move it to the GPU before generation.
    inputs = tokenizer.encode(
        input_str,
        return_tensors="pt"
    )
    inputs = inputs.to('cuda')

    outputs = model.generate(
        inputs,
        max_new_tokens=int(input_token_length),
    )

    output_str = tokenizer.decode(outputs[0])
    print(output_str)

    print("Time taken: ", time.time() - timeStart)