Run BLOOM Chat 176B Model on Serverless

import time
import uuid

from modal import Image, method, Stub, web_endpoint, gpu

# This example showcases running BLOOM Chat 176B (4-bit GPTQ quant) on 5 serverless GPUs.
# It is possible to run the original BLOOM model as well.
# Big kudos to TheBloke for creating the quants:
# https://huggingface.co/TheBloke/bloomz-176B-GPTQ
# https://huggingface.co/TheBloke/BLOOMChat-176B-v1-GPTQ

stub = Stub(name="bloom-chat-v1-q4")  # change this to your own name

IMAGE_MODEL_DIR = "/model"
MODEL_BASE_FILE = "gptq_model-4bit--1g"  # the model file name without the ".safetensors" suffix

# The BLOOM weights were split to stay under Hugging Face's 50GB per-file limit,
# so after downloading we merge the pieces back into a single file.
# This command works with both model versions.
SPLIT_FILE_REGEX = "gptq_model-4bit--1g.JOINBEFOREUSE.split-*.safetensors"
command = f"cd {IMAGE_MODEL_DIR} && cat {SPLIT_FILE_REGEX} > {MODEL_BASE_FILE}.safetensors && rm {SPLIT_FILE_REGEX}"
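
# For illustration (the split suffixes below are hypothetical): the Hub repo ships
# the weights as pieces matching SPLIT_FILE_REGEX, e.g.
#   gptq_model-4bit--1g.JOINBEFOREUSE.split-a.safetensors
#   gptq_model-4bit--1g.JOINBEFOREUSE.split-b.safetensors
# and the shell command above concatenates them in glob order into
# gptq_model-4bit--1g.safetensors, then deletes the pieces to free disk space.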

# A function to download the model at image-build time.
def download_model():
    import transformers
    from huggingface_hub import snapshot_download

    MODEL_NAME = "TheBloke/BLOOMChat-176B-v1-GPTQ"

    # Verify that at least 200GB is free. This ensures the model can be both
    # downloaded and merged: the split files are only removed after the merged
    # copy has been written, so both exist on disk at the same time.
    import shutil
    total, used, free = shutil.disk_usage("/")
    assert free > 200 * 1024 * 1024 * 1024, f"Expected at least 200GB free space. Got {free}"

    # The download may fail once in a while. Simply rerun the script.
    print("Downloading model... expect 3-15 minutes...")
    start_time = time.time()
    snapshot_download(
        MODEL_NAME,
        local_dir=IMAGE_MODEL_DIR,
        resume_download=True,
        # A token is optional but will speed up the download; put your own
        # "hf_..." token here rather than committing a real one.
        token=None,
    )
    end_time = time.time()
    print(f"Download completed, took => {end_time - start_time:.2f}s")

    print("Combining model files... expect 3 minutes...")
    import subprocess
    subprocess.run(command, check=True, shell=True)
    print(f"Model files combined, took => {time.time() - end_time:.2f}s")
    end_time = time.time()

    # Move the Transformers cache now to avoid doing it at inference time.
    print("Moving cache... expect 2-4 minutes...")
    transformers.utils.move_cache()
    print(f"Cache moved, took => {time.time() - end_time:.2f}s")

    print("Done! Modal may take up to 15 minutes to upload a snapshot...")

inference_image = (
    Image.from_dockerhub(
        "nvidia/cuda:11.8.0-devel-ubuntu22.04",
        setup_dockerfile_commands=[
            "RUN apt-get update",
            "RUN apt-get install -y python3 python3-pip python-is-python3",
        ],
    )
    .apt_install("git", "gcc", "build-essential")
    .run_commands(
        "pip install --compile huggingface_hub transformers torch einops hf_transfer",
    )
    .env({
        "HF_HUB_ENABLE_HF_TRANSFER": "1",  # enable fast downloads; this mitigates common Hugging Face read timeouts
        "PIP_NO_CACHE_DIR": "1",
        "PIP_DISABLE_PIP_VERSION_CHECK": "1",
        "SAFETENSORS_FAST_GPU": "1",  # load the model directly into GPU memory, skipping RAM
        "BITSANDBYTES_NOWELCOME": "1",
    })
    .run_function(download_model)
    .run_commands(
        # Installing auto-gptq directly through pip fails to compile the Torch
        # extension, so we clone the repo and build it from source while a T4 GPU
        # is attached.
        "git clone https://github.com/PanQiWei/AutoGPTQ.git",
        "cd AutoGPTQ && pip install --compile .",
        gpu="T4",
    )
)
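
# Everything above this point runs once at image-build time: the CUDA base image
# setup, the pip installs, the model download/merge, and the AutoGPTQ build on a
# T4. Modal snapshots the resulting image, so later containers start from the
# baked-in weights instead of re-downloading them.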

api_image = Image.debian_slim()


@stub.cls(image=inference_image, gpu=gpu.A100(count=5), container_idle_timeout=300, cloud="oci", concurrency_limit=1)
class TheModel:
    def __enter__(self):
        start_import = time.time()
        import torch
        from transformers import AutoTokenizer, StoppingCriteria, StoppingCriteriaList
        from auto_gptq import AutoGPTQForCausalLM
        print(f"importing libraries took => {time.time() - start_import:.2f}s")

        start_load_tokenizer = time.time()
        self.tokenizer = AutoTokenizer.from_pretrained(
            IMAGE_MODEL_DIR, use_fast=True
        )
        print(f"loading tokenizer took => {time.time() - start_load_tokenizer:.2f}s")

        start_loading_model = time.time()
        print("loading model...")
        self.model = AutoGPTQForCausalLM.from_quantized(
            IMAGE_MODEL_DIR,
            model_basename=MODEL_BASE_FILE,
            use_safetensors=True,
            device_map="auto",
            use_triton=False,
            strict=True,
        )
        self.model.tie_weights()
        print(f"Model loaded in => {time.time() - start_loading_model:.2f}s")

        cold_boot_time = time.time() - start_import
        print(f"total cold boot time => {cold_boot_time:.2f}s")
        self.is_loaded = False
        self.cold_boot_time = cold_boot_time

    @method()
    async def generate(self, input, temperature=0.7, max_tokens=256, stop_words=[""]):
        import torch
        from transformers import StoppingCriteria, StoppingCriteriaList

        # Cold boot time; should be zero if the model is already loaded
        if not self.is_loaded:
            cold_boot_time = self.cold_boot_time
            self.is_loaded = True
        else:
            cold_boot_time = 0

        stop_token_ids = self.tokenizer.convert_tokens_to_ids(stop_words)
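
        # The generate() call further below evaluates each StoppingCriteria after
        # every decoding step, passing the sequence produced so far; returning True
        # ends generation. StopOnTokens stops as soon as the most recently generated
        # token matches one of the stop-word ids computed above.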
        class StopOnTokens(StoppingCriteria):
            def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
                for stop_id in stop_token_ids:
                    if input_ids[0][-1] == stop_id:
                        return True
                return False

        stopping_criteria = StoppingCriteriaList([StopOnTokens()])

        t3 = time.time()
        input_ids = self.tokenizer(input, return_tensors='pt').input_ids.cuda()
        input_tokens = len(input_ids[0])
        generation = self.model.generate(
            inputs=input_ids,
            temperature=temperature,
            do_sample=True if temperature > 0 else False,
            max_new_tokens=max_tokens,
            repetition_penalty=1.1,
            stopping_criteria=stopping_criteria if len(stop_words) > 0 else None,
        )
        completion_tokens = len(generation[0]) - input_tokens

        # Provide the completion without the prompt:
        # drop the input tokens from the generated sequence before decoding.
        new_tokens = generation[0][input_tokens:]
        completion = self.tokenizer.decode(new_tokens, skip_special_tokens=True)

        latency = time.time() - t3
        print(f"Input tokens: {input_tokens}")
        print(f"Completion tokens: {completion_tokens}")
        print(f"Generation took => {latency:.2f}s")

        return {
            "id": str(uuid.uuid4()),
            "completion": completion,
            "completion_tokens": completion_tokens,
            "prompt_tokens": input_tokens,
            "execution_time": latency,
            "delay_time": cold_boot_time,
            "model": stub.name,
        }


DEMO_INPUT = """
<human>: What is Modal?
<bot>: Modal (modal.com) lets you run code in the cloud without having to think about infrastructure.
Features
- Run any code remotely within seconds.
- Define container environments in code (or use one of our pre-built backends).
- Scale up horizontally to thousands of containers.
- Deploy and monitor persistent cron jobs.
- Attach GPUs with a single line of code.
- Serve your functions as web endpoints.
- Use powerful primitives like distributed dictionaries and queues.
- Run your code on a schedule.
<human>: What is the future of Modal?
<bot>:
"""


@stub.local_entrypoint()
def main():
    t0 = time.time()
    model = TheModel()
    val = model.generate.call(DEMO_INPUT)
    print(val)
    print(f"Total time: {time.time() - t0:.2f}s")

from pydantic import BaseModel
from typing_extensions import Annotated
from typing import List, Union


class CompletionRequest(BaseModel):
    prompt: Annotated[str, "The prompt for text completion"]
    temperature: Annotated[
        float,
        "Adjusts randomness of outputs, greater than 1 is random and 0 is deterministic.",
    ] = 0.7
    max_tokens: Annotated[
        int, "Maximum number of new tokens to generate for text completion."
    ] = 16
    stop_words: Annotated[Union[str, List[str]], "Any additional stop words."] = []
    ref: Annotated[str, "Reference string for the completion"] = ""
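
# Example request body for the endpoint below (values are illustrative only):
#   {
#     "prompt": "<human>: What is Modal?\n<bot>:",
#     "temperature": 0.7,
#     "max_tokens": 64,
#     "stop_words": ["<human>:"],
#     "ref": "demo-1"
#   }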

@stub.function(image=api_image, cloud="oci", concurrency_limit=1)
@web_endpoint(method="POST")
def api(request: CompletionRequest):
    t = time.time()
    print(f"Request received: {request.ref}")
    result = TheModel().generate.call(
        input=request.prompt,
        temperature=request.temperature,
        max_tokens=request.max_tokens,
        stop_words=request.stop_words,
    )
    result["ref"] = request.ref
    print(f"Request completed: {request.ref} => {time.time() - t:.2f}s")
    return result
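
# Deploying and calling the endpoint (the URL below is a placeholder; Modal
# assigns the real one on deploy):
#   modal deploy bloom_chat.py
#   curl -X POST https://<workspace>--bloom-chat-v1-q4-api.modal.run \
#     -H "Content-Type: application/json" \
#     -d '{"prompt": "<human>: Hello!\n<bot>:", "max_tokens": 64}'
# The JSON response mirrors the dict returned by TheModel.generate (id, completion,
# completion_tokens, prompt_tokens, execution_time, delay_time, model) plus the
# echoed "ref".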