-
-
Save Metawhy/2de021a8310fa13f097cb5ee79f3f103 to your computer and use it in GitHub Desktop.
Run HuggingFace converted GPT-J-6B checkpoint using FastAPI and Ngrok on local GPU (3090 or Titan)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# So you want to run GPT-J-6B using HuggingFace+FastAPI on a local rig (3090 or TITAN) ... tricky. | |
# special help from the Kolob Colab server https://colab.research.google.com/drive/1VFh5DOkCJjWIrQ6eB82lxGKKPgXmsO5D?usp=sharing#scrollTo=iCHgJvfL4alW | |
# Conversion to HF format (12.6GB tar image) found at https://drive.google.com/u/0/uc?id=1NXP75l1Xa5s9K18yf3qLoZcR6p4Wced1&export=download | |
# Uses GDOWN to get the image | |
# You will need 26 GB of space, 12+GB for the tar and 12+GB expanded (you can nuke the tar after expansion) | |
# Near Simplest Language model API, with room to expand! | |
# runs GPT-J-6B on 3090 and TITAN and serves it using FastAPI | |
# change "seq" (which is the context size) to adjust footprint | |
# | |
# JAX-based | |
# seq vram usage | |
# 512 14.7G | |
# 900 15.3G | |
# | |
# HF-based | |
# seq vram usage | |
# 512 15.6 G | |
# 900 --.- G | |
# | |
# uses FastAPI, so install that | |
# https://fastapi.tiangolo.com/tutorial/ | |
# pip install fastapi | |
# pip install uvicorn[standard] | |
# pip install git+https://github.com/finetuneanon/transformers@gpt-neo-localattention3 | |
# pip install termcolor | |
# #`pip install flask-ngrok | |
# #`pip install flask_cloudflared | |
# pip install pyngrok | |
# pip install nest-asyncio | |
# pip install gdown | |
# gdown --id 1NXP75l1Xa5s9K18yf3qLoZcR6p4Wced1 --output ../j6b_ckpt.tar | |
# (results: 12.6GB [18:19], 11.4MB/s) | |
# | |
# note: for my setup I needed to perform the symlink suggested by myjr52 in https://github.com/google/jax/issues/5231 | |
# https://pytorch.org/get-started/previous-versions/ | |
# for cuda 10.1 | |
# pip install torch==1.8.1+cu101 torchvision==0.9.1+cu101 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html | |
# for cuda 11.2 | |
# pip install torch==1.8.1+cu112 torchvision==0.9.1+cu112 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html | |
# conda install python-multipart | |
#-------------------------------------- | |
# check pyngrok — https://github.com/alexdlaird/pyngrok | |
#install | |
# pip install pyngrok | |
# | |
# Set up your ngrok Authtoken | |
# ngrok authtoken xxxxxxxxxxxxx | |
# GO: local execution | |
# XLA_PYTHON_CLIENT_PREALLOCATE=false XLA_PYTHON_CLIENT_ALLOCATOR=platform CUDA_VISIBLE_DEVICES=0 python3 jserv_hf_fast.py | |
# When done try | |
# http://localhost:9995/docs#/default/read_completions_engines_completions_post (the port is SERVER_PORT below) | |
# now you are in FastAPI + EleutherAI land | |
# note: needs async on the read_completions otherwise jax gets upset | |
# REMEMBER: adjust the location of the checkpoint image TAR_PATH | |
# | |
# Using plain HF instead of Jax so can comment out JAX related for this install | |
# ----------------------------------------- | |
# # uses https://github.com/kingoflolz/mesh-transformer-jax | |
# # so install jax on your system so recommend you get it working with your GPU first | |
# # !apt install zstd | |
# | |
# # | |
# # the "slim" version contain only bf16 weights and no optimizer parameters, which minimizes bandwidth and memory | |
# # wget https://the-eye.eu/public/AI/GPT-J-6B/step_383500_slim.tar.zstd | |
# # tar -I zstd -xf step_383500_slim.tar.zstd | |
# # git clone https://github.com/kingoflolz/mesh-transformer-jax.git | |
# # pip install -r mesh-transformer-jax/requirements.txt | |
# # jax 0.2.12 is required due to a regression with xmap in 0.2.13 | |
# # pip install mesh-transformer-jax/ jax==0.2.12 | |
# # I have cuda 10.1 and python 3.9 so had to update | |
# # pip3 install --upgrade "https://storage.googleapis.com/jax-releases/cuda101/jaxlib-0.1.66+cuda101-cp39-none-manylinux2010_x86_64.whl" | |
# ----------------------------------------- | |
# | |
# Started 2021-06-19 (USA Juneteenth) and released to freedom under MIT | |
# | |
from termcolor import colored | |
#from flask import Flask, redirect, url_for, request | |
import json | |
import torch | |
import requests | |
import subprocess | |
import tarfile | |
import os | |
import re | |
import time | |
from threading import Timer | |
from typing import Optional | |
from typing import Dict | |
from fastapi import FastAPI,Request,Body | |
import uvicorn | |
import nest_asyncio | |
from pyngrok import ngrok | |
import threading | |
import numpy as np | |
import transformers | |
from transformers import GPTNeoForCausalLM, AutoConfig,AutoTokenizer,GPT2Tokenizer | |
print(colored("Server Initialization ...", "magenta"))

# Tunnel provider selector. Only the pyngrok path is wired up in this script;
# the Flask-based Cloudflare/Ngrok helpers of the original notebook are unused.
connect_method = "Ngrok"  #@param ["Ngrok", "Cloudflare"]

# Filled in further down, once the checkpoint and tokenizer have been loaded.
model = None
tokenizer = None

#------------------------------------------
# REMEMBER: Change these settings to local values
active_model = ''
runtime_gpu = "cuda:0"
training_gpu = "cuda:0"
TAR_PATH = "../"
check_point_dir = "../j6b_ckpt"
SERVER_PORT = 9995
NGROK_AUTH_TOKEN = "xxxxxxxxx"
#-----------------------------------------

# Print the CUDA state: green when torch can see a GPU, red otherwise.
# https://stackoverflow.com/questions/48152674/how-to-check-if-pytorch-is-using-the-gpu
report_color = "red" if not torch.cuda.is_available() else "green"

# Each probe is called only when its line is printed, so a failing probe
# stops the report at that line and the earlier lines are still shown.
for _label, _probe in (
    (" torch.cuda.is_available() = ", torch.cuda.is_available),
    (" torch.cuda.current_device() = ", torch.cuda.current_device),
    (" torch.cuda.device_count() = ", torch.cuda.device_count),
    (" torch.cuda.get_device_name(0) = ", torch.cuda.get_device_name),
):
    print(colored(_label + str(_probe()), report_color))

# Memory figures for device 0, converted from bytes to GB (1024**3) and
# rounded to one decimal place.
for _template, _probe in (
    (" Mem Allocated:{}GB", torch.cuda.memory_allocated),
    (" Mem Cached: {}GB", torch.cuda.memory_reserved),
):
    print(colored(_template.format(round(_probe(0) / 1024**3, 1)), report_color))
# Locate the checkpoint tarball and unpack it the first time the server runs.
model_on_drive = TAR_PATH + "j6b_ckpt.tar"
print(colored("Checking j6b_ckpt ...", "magenta"))
print(colored(" TAR_PATH ={}".format(TAR_PATH), "green"))
print(colored(" check_point_dir ={}".format(check_point_dir), "green"))
print(colored(" model_on_drive ={}".format(model_on_drive), "green"))
if not os.path.isdir(check_point_dir):
    print(colored("Unpacking tar file, please wait...", "magenta"))
    # Unpack into the parent of check_point_dir rather than the current
    # working directory; otherwise the directory tested above (and loaded
    # later) never appears and the archive is re-extracted on every start.
    # NOTE(review): assumes the archive's top-level folder has the same name
    # as the last component of check_point_dir ("j6b_ckpt") - confirm.
    _extract_root = os.path.dirname(os.path.normpath(check_point_dir)) or "."
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive; only use a tarball obtained from a source you trust.
    with tarfile.open(model_on_drive, "r") as tar:
        tar.extractall(path=_extract_root)
else:
    print(colored("Expanded Checkpoint directory found", "green"))
# Initialize the model
print(colored("Initializing model, please wait...", "magenta"))

# Start from the published GPT-Neo 2.7B configuration, then overwrite the
# fields listed below with the values used for the GPT-J-6B checkpoint.
config = AutoConfig.from_pretrained("EleutherAI/gpt-neo-2.7B")
for _field, _value in (
    ("attention_layers", ["global"] * 28),
    ("attention_types", [["global"], 28]),
    ("num_layers", 28),
    ("num_heads", 16),
    ("vocab_size", 50400),
    # NOTE(review): rotary / rotary_dim / jax are presumably read by the
    # finetuneanon transformers fork named in the install notes - confirm.
    ("rotary", True),
    ("rotary_dim", 64),
    ("jax", True),
):
    setattr(config, _field, _value)

# Derived from the head count set above: 256 dimensions per head.
config.hidden_size = 256 * config.num_heads
try: | |
from collections.abc import MutableMapping | |
except ImportError: | |
from collections import MutableMapping | |
from pathlib import Path | |
class Checkpoint(MutableMapping):
    """Read-only mapping over a checkpoint directory that loads entries lazily.

    ``m.pt`` inside *chkpt_dir* is loaded once in the constructor and used as
    an index: every key maps to a file path, and only the base name of that
    path is used to find the entry's file inside *chkpt_dir*. The file is read
    with ``torch.load`` each time the key is looked up; nothing is cached.

    Args:
        chkpt_dir: Directory (``str`` or ``Path``) holding ``m.pt`` and the
            per-entry files.
        device: ``map_location`` passed to ``torch.load`` for the entries.
    """

    def __init__(self, chkpt_dir, device="cpu"):
        self.device = device
        self.chkpt_dir = Path(chkpt_dir)
        # The index itself is loaded without a map_location.
        self.checkpoint = torch.load(str(self.chkpt_dir / "m.pt"))

    def __len__(self):
        """Return the number of entries listed in the index."""
        return len(self.checkpoint)

    def __getitem__(self, key):
        """Load and return the entry for *key*; raises KeyError if unknown."""
        path = self.chkpt_dir / Path(self.checkpoint[key]).name
        return torch.load(str(path), map_location=self.device)

    def __setitem__(self, key, value):
        """Ignore writes: the mapping is read-only."""
        return

    def __delitem__(self, key, value=None):
        """Ignore deletions: the mapping is read-only.

        ``value`` is optional so that ``del ckpt[key]`` (which passes only the
        key) works; it is kept in the signature for callers that passed it.
        """
        return

    def keys(self):
        """Return the keys of the index without loading any entry."""
        return self.checkpoint.keys()

    def __iter__(self):
        """Yield ``(key, loaded_entry)`` pairs, loading one entry at a time.

        NOTE(review): the Mapping protocol expects ``__iter__`` to yield keys
        only, so the inherited ``items()``/``values()`` do not work on this
        class. Left unchanged in case the loader iterates it this way.
        """
        for key in self.checkpoint:
            yield (key, self.__getitem__(key))

    def __copy__(self):
        """Return a new Checkpoint over the same directory and device."""
        return Checkpoint(self.chkpt_dir, device=self.device)

    def copy(self):
        """Return a new Checkpoint over the same directory and device."""
        return Checkpoint(self.chkpt_dir, device=self.device)
def infer(context, top_k=40, top_p=0.9, temp=1.0, gen_len=512, repetition_penalty=1):
    """Sample a completion of *context* with the module-level model.

    Args:
        context: Prompt text; tokenized with the module-level ``tokenizer``.
        top_k: Forwarded to ``model.generate`` as ``top_k``.
        top_p: Forwarded to ``model.generate`` as ``top_p``.
        temp: Forwarded to ``model.generate`` as ``temperature``.
        gen_len: Passed as both ``min_length`` and ``max_length``.
        repetition_penalty: Forwarded to ``model.generate``. Previously this
            argument was ignored and a fixed 1.5 was always used.

    Returns:
        A list with one decoded string per generated sequence. Each string is
        the decoding of the whole returned sequence, special tokens skipped.
    """
    # Put the prompt on the device the model was moved to (runtime_gpu)
    # instead of the default CUDA device.
    ids = tokenizer(context, return_tensors="pt").input_ids.to(runtime_gpu)
    start = time.time()
    output = model.generate(
        ids,
        do_sample=True,
        min_length=gen_len,
        max_length=gen_len,
        temperature=temp,
        use_cache=True,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        no_repeat_ngram_size=6,
        max_time=60,
    )
    samples = [tokenizer.decode(seq, skip_special_tokens=True) for seq in output]
    print(colored(f"completion done in {time.time() - start:06}s", "green"))
    return samples
def recursive_infer(initial_context, current_context=None, top_k=40, top_p=0.9, temp=1.0, gen_len=256, depth=0, max_depth=5, recursive_refresh=0, repetition_penalty=1):
    """Chain completions: each round is prompted with the previous round's answer.

    The first round (no *current_context*) is prompted with *initial_context*.
    Later rounds are prompted with the previous answer, prefixed with
    ``initial_context + "\\r\\n ... \\r\\n"`` when *recursive_refresh* is 1.
    The answer of a round is the text ``infer`` returns with the first
    ``len(prompt)`` characters cut off.

    Args:
        initial_context: The original prompt.
        current_context: Answer of the previous round, or None/empty on the
            first round.
        top_k, top_p, temp, gen_len, repetition_penalty: Passed to ``infer``.
        depth: Index of the current round (0 for the first).
        max_depth: Rounds at ``depth >= max_depth`` contribute nothing.
        recursive_refresh: 1 to re-prepend *initial_context* on later rounds.

    Returns:
        The answers of rounds ``depth .. max_depth - 1`` in order, each
        followed by a single space; ``''`` when ``depth >= max_depth``.
    """
    previous = current_context if current_context else ''
    print(colored("ENTER recursive_infer:{} {} {} {}".format(len(initial_context), len(previous), depth, max_depth), "red"))
    print(colored(" in_cc:{}".format(previous), "cyan"))

    # Stop before generating: a round at the depth limit always returned ''
    # and its generated text was thrown away, so running it was pure cost.
    if depth >= max_depth:
        return ''

    if not current_context:
        prompt = initial_context
    else:
        prefix = (initial_context + "\r\n ... \r\n") if recursive_refresh == 1 else ''
        prompt = prefix + current_context
    print(colored("loc_c:{}".format(prompt), "yellow"))

    completion = infer(prompt, top_k, top_p, temp, gen_len, repetition_penalty)[0]
    # The decoded completion starts with the prompt; keep only what follows.
    loc_ans = completion[len(prompt):]
    print(colored(" loc_i:{}".format(completion), "white"))
    print(colored(" loc_ans:{}".format(loc_ans), "white"))

    recursive_ans = recursive_infer(initial_context, str(loc_ans), top_k, top_p, temp, gen_len, depth + 1, max_depth, recursive_refresh, repetition_penalty)
    returned_ans = str(loc_ans + ' ' + recursive_ans)
    print(colored(" returned_ans:{}".format(returned_ans), "cyan"))
    print(colored("EXIT recursive_infer:{} {} {} {}".format(len(initial_context), len(previous), depth, max_depth), "red"))
    return returned_ans
# Build the model from the unpacked checkpoint directory: no pretrained name
# is given, the weights come from the Checkpoint mapping passed as state_dict.
print(colored("loading GPTNeoForCausalLM.from_pretrained", "magenta"))
print(colored(" loading from {}".format(check_point_dir), "green"))
model = GPTNeoForCausalLM.from_pretrained(
    pretrained_model_name_or_path=None,
    config=config,
    state_dict=Checkpoint(check_point_dir),
)

print(colored("loading GPT2Tokenizer.from_pretrained", "magenta"))
# Initialize the tokenizer and set up the bad_words_ids to exclude Author's Note tags
tokenizer = AutoTokenizer.from_pretrained("gpt2")
vocab = tokenizer.get_vocab()
vocab_keys = vocab.keys()


def find_keys(char):
    """Return every vocabulary entry whose text contains *char*."""
    return [key for key in vocab_keys if char in key]


# Every vocabulary entry containing one of these fragments is collected, and
# each becomes a one-token sequence in bad_words_ids.
bad_words = [
    key
    for fragment in ("[", " [", "<|endoftext|>")
    for key in find_keys(fragment)
]
bad_words_ids = [[vocab[key]] for key in bad_words]

print(colored(" move to GPU", "magenta"))
model.to(runtime_gpu)
print(colored(" >>>> DONE! <<<<", "green"))
print(colored("PRETEST: warming up processing pipeline", "magenta"))
# Run one completion at startup so the generation path has been exercised
# before the first request arrives; the result is only printed.
pre_prompt = "I am the EleutherAI / GPT-J-6B based AI language model server. I will"
print(colored("PROMPT:" + pre_prompt, "yellow"))
_warmup_text = infer(pre_prompt)[0]
print(colored(_warmup_text, "cyan"))

# The HTTP endpoints are registered on this FastAPI application.
app = FastAPI()
@app.get("/")
def home():
    """Serve a small HTML banner showing that the service is up.

    Registered with ``app.get`` instead of the Flask-style ``app.route``: on a
    FastAPI app the latter calls the handler with the request object and
    expects a Response back, which this zero-argument handler did not provide.
    """
    # Imported here so this handler does not depend on the import block.
    from fastapi.responses import HTMLResponse

    return HTMLResponse("<h1>EleutherAI J6B Service Running!</h1>")
@app.post('/request')
async def koboldrequest(request: Request = None):
    """Generate text for a KoboldAI-style client.

    The request body is JSON with the keys ``text``, ``min``, ``max``,
    ``rep_pen``, ``temperature`` and ``top_p``, plus the optional ``numseqs``
    (default 1) and ``retfultxt`` (default True).

    Returns:
        HTTP 200 with ``{"data": {"text": ...}}`` (first sequence only) when
        ``retfultxt`` is true, or ``{"data": {"seqs": [...]}}`` with the
        prompt tokens stripped from every sequence when it is false.
        HTTP 400 with ``{"error": {"extensions": {"code": ...}}}`` when the
        first sequence is empty or any exception is raised while handling the
        request (including a malformed body or a missing key).
    """
    # Imported here so this handler does not depend on the import block.
    from fastapi.responses import JSONResponse

    def error_response(message):
        # Error envelope used for every failure, sent with HTTP 400.
        return JSONResponse(
            status_code=400,
            content={"error": {"extensions": {"code": message}}},
        )

    try:
        # Starlette exposes the parsed body through an awaitable method.
        js = await request.json()
        txt = js["text"]
        min_length = js["min"]
        max_length = js["max"]
        rep_pen = js["rep_pen"]
        temp = js["temperature"]
        top_p = js["top_p"]
        # Compatibility with un-updated clients
        numseqs = js.get("numseqs", 1)
        retfultxt = js.get("retfultxt", True)

        print(colored("Received Data: {0}".format(txt), "yellow"))
        torch.cuda.empty_cache()
        print(colored("Generating text, please wait...", "green"))
        ids = tokenizer(txt, return_tensors="pt").input_ids.to(runtime_gpu)
        prompt_len = ids.shape[1]
        gen_tokens = model.generate(
            ids.long(),
            do_sample=True,
            min_length=min_length,
            max_length=max_length,
            temperature=temp,
            top_p=top_p,
            repetition_penalty=rep_pen,
            use_cache=True,
            bad_words_ids=bad_words_ids,
            num_return_sequences=numseqs,
        ).long()

        genout = []
        for tkns in gen_tokens:
            token_ids = tkns.tolist()
            if not retfultxt:
                # Strip context tokens out of returned sequences. Slicing from
                # the prompt length also gives an empty result when nothing
                # new was generated (the old negative-index slice returned
                # the whole sequence in that case).
                token_ids = token_ids[prompt_len:]
            # Drop token id 50256 before decoding.
            # NOTE(review): presumably the <|endoftext|> id - confirm.
            token_ids = [t for t in token_ids if t != 50256]
            genout.append(tokenizer.decode(token_ids))
        torch.cuda.empty_cache()

        if len(genout) > 0 and genout[0] != "":
            if retfultxt:
                # Outdated client, send old JSON format
                print(colored("Generated Text: {0}".format(genout[0]), "cyan"))
                return JSONResponse(
                    status_code=200,
                    content={"data": {"text": genout[0]}},
                )
            # New client format with numseq support
            for i, seq in enumerate(genout):
                print(colored("[Result {0}]\n{1}".format(i, seq), "cyan"))
            return JSONResponse(
                status_code=200,
                content={"data": {"seqs": genout}},
            )

        print(colored("[ERROR] Something went wrong during generation!", "red"))
        return error_response("Something went wrong during generation!")
    except Exception as e:
        # Request boundary: report any failure to the client instead of
        # letting it escape the handler.
        print(colored("[ERROR] Something went wrong during generation!", "red"))
        print(colored("{0}".format(e), "red"))
        return error_response("Something went wrong during generation! {0}".format(e))
@app.post("/engines/completions")
async def read_completions(
    #engine_id:str,
    prompt: Optional[str] = None,
    max_tokens: Optional[int] = 16,
    temperature: Optional[float] = 1.0,
    top_p: Optional[float] = 1.0,
    top_k: Optional[int] = 40,
    n: Optional[int] = 1,
    stream: Optional[bool] = False,
    logprobs: Optional[int] = None,
    echo: Optional[bool] = False,
    stop: Optional[list] = None,
    presence_penalty: Optional[float] = 0.0001,
    repetition_penalty: Optional[float] = 1.0000,
    best_of: Optional[int] = 1,
    recursive_depth: Optional[int] = 0,
    recursive_refresh: Optional[int] = 0,
    logit_bias: Optional[Dict[str, float]] = None,
    request: Request = None
):
    """Completion endpoint shaped like an OpenAI text-completion call.

    Every ``|`` in *prompt* is replaced with ``"\\r\\n"`` before tokenizing.
    The generation length is ``max_tokens`` plus the prompt's token count.

    With ``recursive_depth == 0`` the model is sampled directly, using *n* as
    both ``num_beams`` and ``num_return_sequences``, and each choice is the
    decoded full sequence. Otherwise ``recursive_infer`` is run *n* times, one
    after another, with ``max_depth=recursive_depth``.

    ``stream``, ``logprobs``, ``echo``, ``stop``, ``presence_penalty``,
    ``best_of`` and ``logit_bias`` are accepted but not used.

    Returns:
        A dict with ``params`` (the request's query parameters), ``id``,
        ``object``, ``created``, ``model`` and ``choices``; each choice has
        ``prompt``, ``text``, ``index``, ``logprobs`` and ``finish_reason``.

    Raises:
        HTTPException: status 400 when *prompt* is missing. Previously the
            literal text "None" was sent to the model in that case.
    """
    if prompt is None:
        # Imported here so this handler does not depend on the import block.
        from fastapi import HTTPException

        raise HTTPException(status_code=400, detail="prompt is required")

    response = {'params': dict(request.query_params)}
    print(response)

    text = str(prompt).replace("|", "\r\n")
    prompt_len = len(text)
    ids = tokenizer(text, return_tensors="pt").input_ids.to(runtime_gpu)
    max_length = max_tokens + ids.shape[1]

    start = time.time()
    if recursive_depth == 0:
        gen_tokens = model.generate(
            ids,
            do_sample=True,
            min_length=max_length,
            max_length=max_length,
            temperature=temperature,
            use_cache=True,
            num_beams=n,
            num_return_sequences=n,
            # top_k is now honoured here, as it already was on the
            # recursive path.
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            no_repeat_ngram_size=6,
            max_time=60,
        )
        texts = [tokenizer.decode(seq, skip_special_tokens=True) for seq in gen_tokens]
    else:
        # do it serial until we figure out parallel for recursive
        texts = [
            recursive_infer(
                initial_context=str(text),
                current_context=None,
                top_p=top_p,
                top_k=top_k,
                temp=temperature,
                gen_len=max_length,
                depth=0,
                max_depth=recursive_depth,
                recursive_refresh=recursive_refresh,
                repetition_penalty=repetition_penalty,
            )
            for _ in range(n)
        ]

    choices = []
    for i, choice_text in enumerate(texts):
        choices.append({
            'prompt': text,
            'text': choice_text,
            'index': i,
            'logprobs': None,
            'finish_reason': 'length',
        })
        print("GenText[{}]:{}".format(i, choice_text))

    # Throughput in characters per second over all choices together, with the
    # prompt length subtracted once (as before).
    gen_text = "".join(texts)
    elapsed = time.time() - start
    cps = (len(gen_text) - prompt_len) / elapsed if elapsed > 0 else 0.0
    print("elapsed:{} len:{} cps:{}".format(elapsed, len(gen_text), cps))

    response['id'] = ''
    response['object'] = 'text_completion'
    response['created'] = ''
    response['model'] = 'GPT-J-6B_HF'  #args.model
    response['choices'] = choices
    return response
print(colored("Model startup complete! Starting web service....", "green"))

# Register the ngrok auth token only when one has been filled in: None and
# the "xxxxxx" placeholder are skipped. Setting an auth token allows us to
# open multiple tunnels at the same time.
_token_configured = NGROK_AUTH_TOKEN is not None and "xxxxxx" not in NGROK_AUTH_TOKEN
if _token_configured:
    ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Open the tunnel to the local port and show the public address.
public_url = ngrok.connect(SERVER_PORT)
print(colored("Public_URL = " + str(public_url), "cyan"))

nest_asyncio.apply()

print(colored("Ready to Serve!", "green"))
uvicorn.run(app, host="0.0.0.0", port=SERVER_PORT)
# Printed once uvicorn.run() has returned.
print(colored("Happy Service!", "green"))

# http://localhost:9995/docs#/default/read_completions_engines_completions_post
# http://<NGROK_URL_ID>.ngrok.io/docs#/default/read_completions_engines_completions_post
# http://<NGROK_URL_ID>.ngrok.io/docs#/default/koboldrequest_request_post
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment