# So you want to run GPT-J-6B using HuggingFace+FastAPI on a local rig (3090 or TITAN) ... tricky.
# special help from the Kolob Colab server https://colab.research.google.com/drive/1VFh5DOkCJjWIrQ6eB82lxGKKPgXmsO5D?usp=sharing#scrollTo=iCHgJvfL4alW
# Conversion to HF format (12.6GB tar image) found at https://drive.google.com/u/0/uc?id=1NXP75l1Xa5s9K18yf3qLoZcR6p4Wced1&export=download
# Uses gdown to fetch the image
# You will need 26 GB of space: 12+GB for the tar and 12+GB expanded (you can nuke the tar after expansion)
# Near-simplest language model API, with room to expand!
# Runs GPT-J-6B on a 3090 or TITAN and serves it using FastAPI
# Change "seq" (the context size) to adjust the memory footprint
#
# JAX-based
#   seq   vram usage
#   512   14.7G
#   900   15.3G
#
# HF-based
#   seq   vram usage
#   512   15.6G
#   900   --.-G
#
# Uses FastAPI, so install that
# https://fastapi.tiangolo.com/tutorial/
# pip install fastapi
# pip install "uvicorn[standard]"
# pip install git+https://github.com/finetuneanon/transformers@gpt-neo-localattention3
# pip install termcolor
# # pip install flask-ngrok       (only for the Flask variant, not needed here)
# # pip install flask_cloudflared (only for the Flask variant, not needed here)
# pip install pyngrok
# pip install nest-asyncio
# pip install gdown
# gdown --id 1NXP75l1Xa5s9K18yf3qLoZcR6p4Wced1 --output ../j6b_ckpt.tar
# (results in 12.6GB downloaded, about 18:19 at 11.4MB/s)
#
# note: for my setup I needed to perform the symlink suggested by myjr52 in https://github.com/google/jax/issues/5231
# https://pytorch.org/get-started/previous-versions/
# for cuda 10.1
# pip install torch==1.8.1+cu101 torchvision==0.9.1+cu101 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html
# for cuda 11.1/11.2 (the cu111 wheels are the newest 1.8.1 builds and also run on 11.2)
# pip install torch==1.8.1+cu111 torchvision==0.9.1+cu111 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html
# conda install python-multipart
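#
# Optional sanity check (an addition, not from the original gist): before launching,
# confirm that this environment's torch sees the GPU and that the finetuneanon
# transformers fork is the one being imported:
#   python -c "import torch, transformers; print(torch.__version__, torch.cuda.is_available(), transformers.__version__)"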
#--------------------------------------
# check pyngrok: https://github.com/alexdlaird/pyngrok
# install
#   pip install pyngrok
#
# Set up your ngrok authtoken
#   ngrok authtoken xxxxxxxxxxxxx
# GO: local execution
#   XLA_PYTHON_CLIENT_PREALLOCATE=false XLA_PYTHON_CLIENT_ALLOCATOR=platform CUDA_VISIBLE_DEVICES=0 python3 jserv_hf_fast.py
# When it is up, try
#   http://localhost:9995/docs#/default/read_completions_engines_completions_post   (the port must match SERVER_PORT below)
# now you are in FastAPI + EleutherAI land
# note: read_completions needs to be async, otherwise jax gets upset
# REMEMBER: adjust the location of the checkpoint image (TAR_PATH)
#
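# Example request (an illustration, not from the original gist): the completion route
# defined below takes its parameters as query parameters, so a minimal call looks like
#   curl -X POST "http://localhost:9995/engines/completions?prompt=Hello%20world&max_tokens=32&temperature=0.8&top_p=0.9&n=1"
# and the generated text comes back under choices[0]["text"].
#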
# Using plain HF instead of JAX, so the JAX-related steps are commented out for this install
# -----------------------------------------
# # uses https://github.com/kingoflolz/mesh-transformer-jax
# # so install jax on your system and get it working with your GPU first
# # !apt install zstd
# #
# # the "slim" version contains only bf16 weights and no optimizer parameters, which minimizes bandwidth and memory
# # wget https://the-eye.eu/public/AI/GPT-J-6B/step_383500_slim.tar.zstd
# # tar -I zstd -xf step_383500_slim.tar.zstd
# # git clone https://github.com/kingoflolz/mesh-transformer-jax.git
# # pip install -r mesh-transformer-jax/requirements.txt
# # jax 0.2.12 is required due to a regression with xmap in 0.2.13
# # pip install mesh-transformer-jax/ jax==0.2.12
# # I have cuda 10.1 and python 3.9, so I had to update jaxlib:
# # pip3 install --upgrade "https://storage.googleapis.com/jax-releases/cuda101/jaxlib-0.1.66+cuda101-cp39-none-manylinux2010_x86_64.whl"
# -----------------------------------------
#
# Started 2021-06-19 (USA Juneteenth) and released to freedom under MIT
#
from termcolor import colored
#from flask import Flask, redirect, url_for, request
import json
import torch
import requests
import subprocess
import tarfile
import os
import re
import time
from threading import Timer
from typing import Optional
from typing import Dict
from fastapi import FastAPI, Request, Body
from fastapi.responses import HTMLResponse, JSONResponse
import uvicorn
import nest_asyncio
from pyngrok import ngrok
import threading
import numpy as np
import transformers
from transformers import GPTNeoForCausalLM, AutoConfig, AutoTokenizer, GPT2Tokenizer
print(colored("Server Initialization ...", "magenta")) | |
connect_method = "Ngrok" #@param ["Ngrok", "Cloudflare"] | |
#if connect_method == "Cloudflare": | |
# from flask_cloudflared import run_with_cloudflared | |
#elif connect_method == "Ngrok": | |
# from flask_ngrok import run_with_ngrok | |
model = None | |
tokenizer = None | |
#------------------------------------------ | |
# REMEMBER: Change these settings to local values | |
active_model='' | |
runtime_gpu="cuda:0" | |
training_gpu="cuda:0" | |
TAR_PATH ="../" | |
check_point_dir="../j6b_ckpt" | |
SERVER_PORT = 9995 | |
NGROK_AUTH_TOKEN ="xxxxxxxxx" | |
#----------------------------------------- | |
#https://stackoverflow.com/questions/48152674/how-to-check-if-pytorch-is-using-the-gpu | |
report_color ="green" | |
if (not torch.cuda.is_available()): report_color="red" | |
print(colored(" torch.cuda.is_available() = "+str(torch.cuda.is_available()), report_color)) | |
print(colored(" torch.cuda.current_device() = "+str(torch.cuda.current_device()), report_color)) | |
print(colored(" torch.cuda.device_count() = "+str(torch.cuda.device_count()), report_color)) | |
print(colored(" torch.cuda.get_device_name(0) = "+str(torch.cuda.get_device_name()), report_color)) | |
print(colored(" Mem Allocated:{}GB".format(round(torch.cuda.memory_allocated(0)/1024**3,1)), report_color)) | |
print(colored(" Mem Cached: {}GB".format(round(torch.cuda.memory_reserved(0)/1024**3,1)), report_color)) | |
# Set path to tar file and unpack it
model_on_drive = TAR_PATH + "j6b_ckpt.tar"
print(colored("Checking j6b_ckpt ...", "magenta"))
print(colored(" TAR_PATH        ={}".format(TAR_PATH), "green"))
print(colored(" check_point_dir ={}".format(check_point_dir), "green"))
print(colored(" model_on_drive  ={}".format(model_on_drive), "green"))
if (not os.path.isdir(check_point_dir)):
    print(colored("Unpacking tar file, please wait...", "magenta"))
    tar = tarfile.open(model_on_drive, "r")
    # extract next to the tar so the result lands at check_point_dir
    # (extracting into the current directory leaves ../j6b_ckpt/m.pt missing)
    tar.extractall(path=TAR_PATH)
    tar.close()
else:
    print(colored("Expanded Checkpoint directory found", "green"))
# Initialize the model
print(colored("Initializing model, please wait...", "magenta"))
config = AutoConfig.from_pretrained("EleutherAI/gpt-neo-2.7B")
config.attention_layers = ["global"] * 28
config.attention_types = [["global"], 28]
config.num_layers = 28
config.num_heads = 16
config.hidden_size = 256 * config.num_heads
config.vocab_size = 50400
config.rotary = True
config.rotary_dim = 64
config.jax = True
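# Rough parameter arithmetic (added for orientation, not from the original gist):
# hidden_size = 256 * 16 = 4096; each of the 28 layers carries roughly
# 4*4096^2 attention weights plus 8*4096^2 MLP weights, about 201M parameters,
# so 28 * 201M is about 5.6B, and the 50400 x 4096 embedding adds another ~0.2B,
# which is how this config lands in the ~6B-parameter range of GPT-J-6B.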
try:
    from collections.abc import MutableMapping
except ImportError:
    from collections import MutableMapping
from pathlib import Path

# Lazy checkpoint loader: looks like a state_dict, but each tensor shard is
# only read from disk when its key is accessed.
class Checkpoint(MutableMapping):
    def __init__(self, chkpt_dir, device="cpu"):
        self.device = device
        self.chkpt_dir = Path(chkpt_dir)
        self.checkpoint = torch.load(str(chkpt_dir / Path("m.pt")))
    def __len__(self):
        return len(self.checkpoint)
    def __getitem__(self, key):
        path = self.chkpt_dir / Path(self.checkpoint[key]).name
        return torch.load(str(path), map_location=self.device)
    def __setitem__(self, key, value):
        return
    def __delitem__(self, key):
        return
    def keys(self):
        return self.checkpoint.keys()
    def __iter__(self):
        for key in self.checkpoint:
            yield (key, self.__getitem__(key))
    def __copy__(self):
        return Checkpoint(self.chkpt_dir, device=self.device)
    def copy(self):
        return Checkpoint(self.chkpt_dir, device=self.device)
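
# A minimal sketch (an addition, not part of the original gist) for sanity-checking
# the unpacked checkpoint before from_pretrained touches it: every entry in the
# m.pt index should resolve to a shard file sitting next to it. The helper name
# is illustrative and it is not called anywhere by default.
def verify_checkpoint_shards(chkpt_dir):
    index = torch.load(str(Path(chkpt_dir) / "m.pt"))  # maps parameter name -> shard filename
    missing = [key for key, shard in index.items()
               if not (Path(chkpt_dir) / Path(shard).name).exists()]
    if missing:
        print(colored("Missing checkpoint shards: {}".format(missing), "red"))
    return len(missing) == 0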
def infer(context, top_k=40, top_p=0.9, temp=1.0, gen_len=512, repetition_penalty=1):
    tokens = tokenizer(context, return_tensors="pt").input_ids
    ids = tokens.cuda()
    start = time.time()
    #output = network.generate(batched_tokens, length, gen_len, {"top_p": np.ones(total_batch) * top_p, "temp": np.ones(total_batch) * temp})
    output = model.generate(ids,
                            do_sample=True,
                            min_length=gen_len,
                            max_length=gen_len,
                            temperature=temp,
                            use_cache=True,
                            top_p=top_p,
                            repetition_penalty=repetition_penalty,
                            no_repeat_ngram_size=6,
                            max_time=60
                            )
    samples = []
    for i, out_seq in enumerate(output):
        samples.append(tokenizer.decode(out_seq, skip_special_tokens=True))
    #for o in decoded_tokens[:, :, 0]:
    #    samples.append(tokenizer.decode(o))
    print(colored(f"completion done in {time.time() - start:06}s", "green"))
    return samples
def recursive_infer(initial_context, current_context=None, top_k=40, top_p=0.9, temp=1.0,
                    gen_len=256, depth=0, max_depth=5, recursive_refresh=0, repetition_penalty=1):
    lcc = 0
    ic = initial_context
    cc = ''
    if current_context:
        lcc = len(current_context)
        cc = current_context
    print(colored("ENTER recursive_infer:{} {} {} {}".format(len(initial_context), lcc, depth, max_depth), "red"))
    print(colored("  in_cc:{}".format(cc), "cyan"))
    c = ''
    if not current_context:
        c = initial_context
    else:
        if (recursive_refresh == 1):
            c = initial_context + "\r\n ... \r\n"
        c = c + current_context
    print(colored("  loc_c:{}".format(c), "yellow"))
    loc_len = gen_len + (len(c) / 3)
    i = infer(c, top_k, top_p, temp, gen_len, repetition_penalty)[0]
    #yield i[len(c):]
    #yield i
    loc_ans = i[len(c):]
    print(colored("  loc_i:{}".format(i), "white"))
    print(colored("  loc_ans:{}".format(loc_ans), "white"))
    if depth >= max_depth: return ''
    #yield from recursive_infer(initial_context, i[len(c):], top_k, top_p, temp, gen_len, depth+1, max_depth, recursive_refresh, repetition_penalty)
    recursive_ans = recursive_infer(initial_context, str(loc_ans), top_k, top_p, temp, gen_len,
                                    depth + 1, max_depth, recursive_refresh, repetition_penalty)
    returned_ans = str(loc_ans + ' ' + recursive_ans)
    print(colored("  returned_ans:{}".format(returned_ans), "cyan"))
    print(colored("EXIT recursive_infer:{} {} {} {}".format(len(initial_context), lcc, depth, max_depth), "red"))
    return returned_ans
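
# Example call (illustrative, not from the original gist): generate a completion and
# recursively continue from the model's own output, re-prefixing the original
# prompt on each pass because recursive_refresh=1:
#   long_text = recursive_infer("Write a story about a lighthouse keeper.",
#                               gen_len=256, max_depth=2, recursive_refresh=1)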
#model = GPTNeoForCausalLM.from_pretrained(pretrained_model_name_or_path=None, config=config, state_dict=Checkpoint())
print(colored("loading GPTNeoForCausalLM.from_pretrained", "magenta"))
print(colored("  loading from {}".format(check_point_dir), "green"))
model = GPTNeoForCausalLM.from_pretrained(pretrained_model_name_or_path=None, config=config, state_dict=Checkpoint(check_point_dir))

print(colored("loading GPT2Tokenizer.from_pretrained", "magenta"))
#tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B")
# Initialize the tokenizer and set up the bad_words_ids to exclude Author's Note tags
tokenizer = AutoTokenizer.from_pretrained("gpt2")
vocab = tokenizer.get_vocab()
vocab_keys = vocab.keys()
find_keys = lambda char: [key for key in vocab_keys if key.find(char) != -1]
bad_words = []
bad_words_ids = []
bad_words.extend(find_keys("["))
bad_words.extend(find_keys(" ["))
bad_words.extend(find_keys("<|endoftext|>"))
for key in bad_words:
    bad_id = vocab[key]
    bad_words_ids.append([bad_id])

print(colored("  move to GPU", "magenta"))
model.to(runtime_gpu)
print(colored("  >>>> DONE! <<<<", "green"))

print(colored("PRETEST: warming up processing pipeline", "magenta"))
# warms up the processing on startup
pre_prompt = "I am the EleutherAI / GPT-J-6B based AI language model server. I will"
print(colored("PROMPT:" + pre_prompt, "yellow"))
print(colored(infer(pre_prompt)[0], "cyan"))
# app = Flask(__name__)
app = FastAPI()

#if connect_method == "Cloudflare":
#    run_with_cloudflared(app)
#elif connect_method == "Ngrok":
#    run_with_ngrok(app)

@app.get("/", response_class=HTMLResponse)
def home():
    return "<h1>EleutherAI J6B Service Running!</h1>"
@app.post('/request')
async def koboldrequest(request: Request = None):
    try:
        #clear_output()
        js = await request.json()
        txt = js["text"]
        min = js["min"]
        max = js["max"]
        rep_pen = js["rep_pen"]
        temp = js["temperature"]
        top_p = js["top_p"]
        # Compatibility with un-updated clients
        if("numseqs" in js):
            numseqs = js["numseqs"]
        else:
            numseqs = 1
        if("retfultxt" in js):
            retfultxt = js["retfultxt"]
        else:
            retfultxt = True

        print(colored("Received Data: {0}".format(txt), "yellow"))
        torch.cuda.empty_cache()
        print(colored("Generating text, please wait...", "green"))
        tokens = tokenizer(txt, return_tensors="pt").input_ids.to("cpu")
        ids = tokens.cuda()
        gen_tokens = model.generate(
            ids.long().cuda(),
            do_sample=True,
            min_length=min,
            max_length=max,
            temperature=temp,
            top_p=top_p,
            repetition_penalty=rep_pen,
            use_cache=True,
            bad_words_ids=bad_words_ids,
            num_return_sequences=numseqs
        ).long()
        genout = []
        for tkns in gen_tokens:
            if(not retfultxt):
                # Strip context tokens out of returned sequences
                dif = (len(tkns) - len(tokens[0])) * -1
                tkns = tkns[dif:]
                tkns = list(filter(lambda a: a != 50256, tkns))
            genout.append(tokenizer.decode(tkns))
        torch.cuda.empty_cache()

        if(len(genout) > 0 and genout[0] != ""):
            if(retfultxt):
                # Outdated client, send old JSON format
                print(colored("Generated Text: {0}".format(genout[0]), "cyan"))
                response = JSONResponse(
                    content={"data": {"text": genout[0]}},
                    status_code=200
                )
            else:
                # New client format with numseq support
                for i, seq in enumerate(genout):
                    print(colored("[Result {0}]\n{1}".format(i, seq), "cyan"))
                response = JSONResponse(
                    content={"data": {"seqs": genout}},
                    status_code=200
                )
            return response
        else:
            print(colored("[ERROR] Something went wrong during generation!", "red"))
            response = JSONResponse(
                content={"error": {"extensions": {"code": "Something went wrong during generation!"}}},
                status_code=400
            )
            # drop references before returning the error
            js = {}
            tokens = []
            ids = []
            gen_tokens = []
            genout = ""
            return response
    except Exception as e:
        print(colored("[ERROR] Something went wrong during generation!", "red"))
        print(colored("{0}".format(e), "red"))
        response = JSONResponse(
            content={"error": {"extensions": {"code": "Something went wrong during generation! {0}".format(e)}}},
            status_code=400
        )
        return response
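
# A hedged client sketch (not from the original gist) for the KoboldAI-style
# /request route above; the field names follow the keys read from the JSON body:
#   curl -X POST http://localhost:9995/request \
#        -H "Content-Type: application/json" \
#        -d '{"text": "The old keeper climbed the stairs", "min": 10, "max": 80,
#             "rep_pen": 1.1, "temperature": 0.9, "top_p": 0.9, "numseqs": 1, "retfultxt": false}'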
@app.post("/engines/completions") | |
async def read_completions( | |
#engine_id:str, | |
prompt:Optional[str] = None, | |
max_tokens: Optional[int]=16, | |
temperature: Optional[float]=1.0, | |
top_p:Optional[float]=1.0, | |
top_k:Optional[int]=40, | |
n:Optional[int]=1, | |
stream:Optional[bool]=False, | |
logprobs:Optional[int]=None, | |
echo:Optional[bool]=False, | |
stop:Optional[list]=None, | |
presence_penalty:Optional[float]=0.0001, | |
repetition_penalty:Optional[float]=1.0000, | |
best_of:Optional[int]=1, | |
recursive_depth:Optional[int]=0, | |
recursive_refresh:Optional[int]=0, | |
logit_bias:Optional[Dict[str,float]]=None, | |
request: Request=None | |
): | |
global active_model,model,tokenizer | |
response={} | |
response['params']= dict(request.query_params) | |
print(response) | |
text = str(prompt) | |
text = text.replace("|","\r\n") | |
prompt_len = len(text) | |
ids = tokenizer(text, return_tensors="pt").input_ids.to(runtime_gpu) | |
max_length = max_tokens + ids.shape[1] | |
do_sample=True | |
use_cache=True | |
start = time.time() | |
num_return_sequences=n | |
num_beams = n | |
num_beam_groups=n | |
if (recursive_depth== 0): | |
gen_tokens = model.generate( | |
ids, | |
do_sample=True, | |
min_length=max_length, | |
max_length=max_length, | |
temperature=temperature, | |
use_cache=True, | |
num_beams = num_beams, | |
num_return_sequences=num_return_sequences, | |
# num_beam_groups=num_beam_groups, | |
# early_stopping=True, | |
top_p=top_p, | |
# top_k=50, | |
repetition_penalty =repetition_penalty, | |
no_repeat_ngram_size=6, | |
max_time=60 | |
) | |
else: | |
gen_tokens = [] | |
# do it serial until we figure out parallel for recursive | |
for x in range(num_return_sequences): | |
ref_text = str(text) | |
gen_tokens.append( recursive_infer(initial_context=str(ref_text), | |
current_context=None, | |
top_p=top_p,top_k=top_k, temp=temperature, | |
gen_len=max_length, | |
depth=0, | |
max_depth = recursive_depth, | |
recursive_refresh=recursive_refresh, | |
repetition_penalty=repetition_penalty | |
)) | |
last_prompt=text | |
choices=[] | |
gen_text='' | |
for i,out_seq in enumerate(gen_tokens): | |
choice={} | |
choice['prompt']=last_prompt | |
if (recursive_depth== 0): | |
choice['text']=tokenizer.decode(out_seq, skip_special_tokens=True) | |
else: | |
choice['text']=out_seq | |
choice['index']=i | |
choice['logprobs']=None | |
choice['finish_reason']='length' | |
choices.append(choice) | |
print("GenText[{}]:{}".format(i,choice['text'])) | |
gen_text = gen_text + choice['text'] | |
if (recursive_depth==0): | |
last_prompt = text | |
else: | |
last_prompt = text | |
#last_prompt = out_seq | |
#if (recursive_refresh==1): | |
# last_prompt = text +"\r\n ... \r\n"+out_seq | |
#gen_text = tokenizer.batch_decode(gen_tokens)[0] | |
fin = time.time() | |
elapsed = fin - start | |
cps = (len(gen_text)-prompt_len) / elapsed | |
print("elapsed:{} len:{} cps:{}".format(elapsed,len(gen_text),cps)) | |
response['id']='' | |
response['object']='text_completion' | |
response['created']='' | |
response['model']= 'GPT-J-6B_HF' #args.model | |
response['choices']=choices | |
return(response) | |
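
# The reply mirrors the OpenAI-style completion shape built above, so a client can
# consume it roughly like this (a sketch, not from the original gist):
#   r = requests.post("http://localhost:9995/engines/completions",
#                     params={"prompt": "Hello world", "max_tokens": 32, "n": 1})
#   print(r.json()["choices"][0]["text"])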
print(colored("Model startup complete! Starting web service....", "green")) | |
# Setting an auth token allows us to open multiple | |
# tunnels at the same time | |
if (NGROK_AUTH_TOKEN is not None) and not ("xxxxxx" in NGROK_AUTH_TOKEN ) : | |
ngrok.set_auth_token(NGROK_AUTH_TOKEN) | |
public_url = ngrok.connect(SERVER_PORT) | |
print(colored("Public_URL = "+str(public_url), "cyan")) | |
nest_asyncio.apply() | |
#app.run() | |
#if __name__ == "__main__": | |
print(colored("Ready to Serve!", "green")) | |
uvicorn.run(app, host="0.0.0.0", port=SERVER_PORT) | |
print (colored("Happy Service!", "green")) | |
# http://localhost:9995/docs#/default/read_completions_engines_completions_post | |
# http://<NGROK_URL_ID>.ngrok.io/docs#/default/read_completions_engines_completions_post | |
# http://<NGROK_URL_ID>.ngrok.io/docs#/default/koboldrequest_request_post | |
awesome!!!

loading from ../j6b_ckpt
Traceback (most recent call last):
  File "jserv-hf.py", line 287, in <module>
    model = GPTNeoForCausalLM.from_pretrained(pretrained_model_name_or_path=None, config=config, state_dict=Checkpoint(check_point_dir))
  File "jserv-hf.py", line 201, in __init__
    self.checkpoint = torch.load(str(chkpt_dir / Path("m.pt")))
  File "/home/jp/miniconda3/envs/torch/lib/python3.8/site-packages/torch/serialization.py", line 594, in load
    with _open_file_like(f, 'rb') as opened_file:
  File "/home/jp/miniconda3/envs/torch/lib/python3.8/site-packages/torch/serialization.py", line 230, in _open_file_like
    return _open_file(name_or_buffer, mode)
  File "/home/jp/miniconda3/envs/torch/lib/python3.8/site-packages/torch/serialization.py", line 211, in __init__
    super(_open_file, self).__init__(open(name, mode))
FileNotFoundError: [Errno 2] No such file or directory: '../j6b_ckpt/m.pt'

Needed to use @app.api_route for some reason.