Running GPT-J-6B (semi-)direct from HuggingFace on a 3080 or >=14GB VRAM GPU
# Running GPT-J-6B on a 3080 (semi-)direct from HuggingFace
#
# My laptop card has 15458 MiB free, but has problems loading the model directly from HF.
# This code fixes it using the "no_init" method by @kurumuz
# found in https://discord.com/channels/729741769192767510/851918317039255592/916463267264725083
# It both loads faster and avoids the intermediate out-of-memory error.
# I now have 2564 MiB left over.
#
# BEFORE LOADING
# index, name, driver_version, memory.total [MiB], memory.used [MiB], memory.free [MiB]
# 0, NVIDIA GeForce RTX 3080 Laptop GPU, 471.41, 16384 MiB, 926 MiB, 15458 MiB
#
# AFTER LOADING
# index, name, driver_version, memory.total [MiB], memory.used [MiB], memory.free [MiB]
# 0, NVIDIA GeForce RTX 3080 Laptop GPU, 471.41, 16384 MiB, 13820 MiB, 2564 MiB
#
# Conda installs:
# conda install transformers
# conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch
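#
# How the fix works: no_init() (defined below) temporarily swaps reset_parameters
# on torch.nn.Linear / Embedding / LayerNorm for a no-op while from_pretrained runs,
# so the default random weight initialization, which the fp16 checkpoint overwrites
# anyway, is skipped during loading.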
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import sys
from subprocess import call

print('__Python VERSION:', sys.version)
print('__pyTorch VERSION:', torch.__version__)
print('__CUDA VERSION:', torch.version.cuda)
# call(["nvcc", "--version"])  # does not work
#! nvcc --version
print('__CUDNN VERSION:', torch.backends.cudnn.version())
print('__Number CUDA Devices:', torch.cuda.device_count())
print('__Devices')
print()

print("BEFORE")
call(["nvidia-smi", "--format=csv", "--query-gpu=index,name,driver_version,memory.total,memory.used,memory.free"])
print()

print('Active CUDA Device: GPU', torch.cuda.current_device())
print('Available devices ', torch.cuda.device_count())
print('Current cuda device ', torch.cuda.current_device())

print("Start GPT-j-6B")
if torch.cuda.is_available():
    print("Using GPU")
    device = torch.device("cuda")
else:
    print("Using CPU")
    device = torch.device("cpu")
def no_init(loading_code):
    # Temporarily disable weight (re-)initialization for the common module types,
    # run the loading code, then restore the original reset_parameters methods.
    def dummy(self):
        return

    modules = [torch.nn.Linear, torch.nn.Embedding, torch.nn.LayerNorm]
    original = {}
    for mod in modules:
        original[mod] = mod.reset_parameters
        mod.reset_parameters = dummy
    result = loading_code()
    for mod in modules:
        mod.reset_parameters = original[mod]
    return result
print(" Load Model") | |
#model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B") | |
model = no_init(lambda: AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", revision='float16', torch_dtype=torch.float16, low_cpu_mem_usage=True)) #.to(device, torch_dtype=torch.float16) | |
print(" Move Model") | |
model.to(device) | |
print(" Load Tokenizer") | |
#tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B") | |
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B", revision="float16") | |
print(" Tokenize and Generate") | |
prompt = "Faced with the problem of universal happiness, the great oracle GPT-J-6B spoke wisely when it said:'" | |
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device) | |
generated_ids = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=200) | |
generated_text = tokenizer.decode(generated_ids[0]) | |
print() | |
print("Prompt:",prompt) | |
print("Generated:",generated_text) | |
print("Done J-6B") | |
print() | |
print("AFTER") | |
call(["nvidia-smi", "--format=csv", "--query-gpu=index,name,driver_version,memory.total,memory.used,memory.free"]) | |
print() |
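# Optional cross-check (illustrative): report the GPU memory held by this process
# from within PyTorch, without shelling out to nvidia-smi.
if torch.cuda.is_available():
    print("torch reports: allocated {:.0f} MiB, reserved {:.0f} MiB".format(
        torch.cuda.memory_allocated() / 2**20, torch.cuda.memory_reserved() / 2**20))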