Created July 18, 2023 20:30
modal script that creates llama2_image and defines a function that can run inference on it
from modal import Image, Stub, Secret, gpu
from pathlib import Path
import os

MODEL_PATH = "/model"


def download_models():
    # Runs at image build time: downloads the Llama 2 weights and saves them into the image at MODEL_PATH.
    from transformers import AutoTokenizer, AutoModelForCausalLM

    token = os.environ["HUGGINGFACE_TOKEN"]
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", use_auth_token=token)
    tokenizer.save_pretrained(MODEL_PATH)
    model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", use_auth_token=token)
    model.save_pretrained(MODEL_PATH)
## adapted from https://github.com/modal-labs/doppel-bot/blob/main/src/common.py
# versions might be out of date
llama2_image = (
    Image.micromamba()
    .micromamba_install(
        "cudatoolkit=11.7",
        "cudnn=8.1.0",
        "cuda-nvcc",
        channels=["conda-forge", "nvidia"],
    )
    .apt_install("git")
    .pip_install(
        "accelerate==0.18.0",
        "bitsandbytes==0.37.0",
        "bitsandbytes-cuda117==0.26.0.post2",
        "datasets==2.10.1",
        "fire==0.5.0",
        "gradio==3.23.0",
        "peft @ git+https://github.com/huggingface/peft.git@e536616888d51b453ed354a6f1e243fecb02ea08",
        "transformers @ git+https://github.com/huggingface/transformers.git@a92e0ad2e20ef4ce28410b5e05c5d63a5a304e65",
        "torch==2.0.0",
        "torchvision==0.15.1",
        "sentencepiece==0.1.97",
    )
    .run_function(download_models, memory=32768, secret=Secret.from_name("hugging-face"), timeout=3600)
)
stub = Stub(name="llama2", image=llama2_image)


@stub.function(
    gpu=gpu.A100(memory=40),  # 40 GB A100
)
def main():
    """
    Run this function with: modal run modal_llama2::main

    Prereqs:
    - the Modal "hugging-face" secret must be configured correctly
    - you must have access to https://huggingface.co/meta-llama/Llama-2-7b-hf
      (request access on Hugging Face & https://ai.meta.com/resources/models-and-libraries/llama-downloads;
      took me ~1 hr to get approved)

    The first run will need to build the image, which takes ~20 mins (downloading ~30 GB from Hugging Face);
    later runs take < 1 min.

    See this notebook for other things you can do with the model:
    https://github.com/facebookresearch/llama-recipes/blob/main/quickstart.ipynb
    """
    from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM
    import torch

    load_8bit = False
    device = "cuda"

    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model: LlamaForCausalLM = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        load_in_8bit=load_8bit,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    model.eval()

    from transformers import GenerationConfig

    # prompt copied from https://github.com/facebookresearch/llama-recipes/blob/main/quickstart.ipynb
    prompt = """
Summarize this dialog:
A: Hi Tom, are you busy tomorrow’s afternoon?
B: I’m pretty sure I am. What’s up?
A: Can you go with me to the animal shelter?.
B: What do you want to do?
A: I want to get a puppy for my son.
B: That will make him so happy.
A: Yeah, we’ve discussed it many times. I think he’s ready now.
B: That’s good. Raising a dog is a tough issue. Like having a baby ;-)
A: I'll get him one of those little dogs.
B: One that won't grow up too big;-)
A: And eat too much;-))
B: Do you know which one he would like?
A: Oh, yes, I took him there last Monday. He showed me one that he really liked.
B: I bet you had to drag him away.
A: He wanted to take it home right away ;-).
B: I wonder what he'll name it.
A: He said he’d name it after his dead hamster – Lemmy - he's a great Motorhead fan :-)))
---
Summary:
"""
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    # tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    # print(tokens)
    generation_config = GenerationConfig()
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            # parameters below are set arbitrarily; a lot are just defaults
            return_dict_in_generate=True,
            output_scores=True,
            do_sample=True,
            temperature=0.3,
            top_p=0.85,
            top_k=40,
            num_beams=1,
            max_new_tokens=600,
            repetition_penalty=1.2,
        )
    s = generation_output.sequences[0]
    run_output = tokenizer.decode(s)
    print("Run output:", run_output)
    return run_output
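Not part of the original gist: if you would rather trigger the function from your own entrypoint instead of running main directly with modal run, a minimal sketch is below. The name cli is hypothetical, and the remote-invocation method depends on your Modal client version (older clients use .call(), newer ones use .remote()).

# Sketch only (assumptions noted above); append to the same file after main().
@stub.local_entrypoint()
def cli():
    # Runs main() in the remote A100 container defined above and returns its result locally.
    output = main.call()  # on newer Modal client versions this is main.remote()
    print(output)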
hey @stevenhao I tried this on a larger raw text and it's just doing text completion; when I use a bigger-parameter model it starts to follow instructions more properly.
Any idea why?
@haiderasad I'm not 100% sure, but I think it's because this gist downloads the meta-llama/Llama-2-7b-hf model, which is just the pre-trained base model and hasn't been fine-tuned / aligned yet. So all it can do is text completion.
If you want something that follows instructions, try one of the fine-tuned ones like meta-llama/Llama-2-7b-chat.
No idea why using a bigger-parameter model would make it follow instructions better, though.
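For reference, a minimal sketch of what swapping in the chat-tuned model could look like. Assumptions: the HF-format repo id is meta-llama/Llama-2-7b-chat-hf, HUGGINGFACE_TOKEN is set, and the prompt uses the Llama 2 chat instruction format; adapt download_models() and the prompt in main() accordingly.

# Sketch only, not from the gist; assumptions as noted above.
import os
from transformers import AutoTokenizer, AutoModelForCausalLM

CHAT_MODEL = "meta-llama/Llama-2-7b-chat-hf"  # assumed HF-format repo id for the chat-tuned model


def download_chat_model(model_path="/model-chat"):
    # Same idea as download_models(), but pointing at the chat-tuned weights.
    token = os.environ["HUGGINGFACE_TOKEN"]
    AutoTokenizer.from_pretrained(CHAT_MODEL, use_auth_token=token).save_pretrained(model_path)
    AutoModelForCausalLM.from_pretrained(CHAT_MODEL, use_auth_token=token).save_pretrained(model_path)


def build_chat_prompt(user_message, system_message="You are a helpful assistant."):
    # Llama 2 chat models expect this instruction format rather than raw text completion;
    # the tokenizer adds the leading <s> (BOS) token itself, so it is omitted here.
    return f"[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n{user_message} [/INST]"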
Here's the output I got: