Running GPT-J-6B (semi-)direct from HuggingFace on a 3080 or >=14GB VRAM GPU
# Running GPT-J-6B on a 3080 (semi-)direct from HuggingFace
#
# My laptop card has 15458 MiB free, but has problems loading the model directly from HF.
# This code fixes it using the "no_init" method by @kurumuz
# found in https://discord.com/channels/729741769192767510/851918317039255592/916463267264725083
# with it the model both loads faster and avoids the intermediate out-of-memory error
# I now have 2564 MiB left over
#
# BEFORE LOADING
# index, name, driver_version, memory.total [MiB], memory.used [MiB], memory.free [MiB]
# 0, NVIDIA GeForce RTX 3080 Laptop GPU, 471.41, 16384 MiB, 926 MiB, 15458 MiB
#
# AFTER LOADING
# index, name, driver_version, memory.total [MiB], memory.used [MiB], memory.free [MiB]
# 0, NVIDIA GeForce RTX 3080 Laptop GPU, 471.41, 16384 MiB, 13820 MiB, 2564 MiB
#
# Conda installs:
# conda install transformers
# conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch
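# (Assumption, not from the original gist: a plain pip setup should work as well,
#  e.g. "pip install transformers" plus a CUDA-enabled PyTorch build from pytorch.org.)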
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import sys
print('__Python VERSION:', sys.version)
print('__pyTorch VERSION:', torch.__version__)
print('__CUDA VERSION:', torch.version.cuda)
from subprocess import call
# call(["nvcc", "--version"]) #does not work
#! nvcc --version
print('__CUDNN VERSION:', torch.backends.cudnn.version())
print('__Number CUDA Devices:', torch.cuda.device_count())
print('__Devices')
print()
print("BEFORE")
call(["nvidia-smi", "--format=csv", "--query-gpu=index,name,driver_version,memory.total,memory.used,memory.free"])
print()
print('Active CUDA Device: GPU', torch.cuda.current_device())
print('Available devices:', torch.cuda.device_count())
print('Current CUDA device:', torch.cuda.current_device())
print("Start GPT-J-6B")
if torch.cuda.is_available():
    print("Using GPU")
    device = torch.device("cuda")
else:
    print("Using CPU")
    device = torch.device("cpu")
def no_init(loading_code):
    def dummy(self):
        return

    modules = [torch.nn.Linear, torch.nn.Embedding, torch.nn.LayerNorm]
    original = {}
    for mod in modules:
        original[mod] = mod.reset_parameters
        mod.reset_parameters = dummy

    result = loading_code()

    for mod in modules:
        mod.reset_parameters = original[mod]

    return result
print(" Load Model")
#model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")
model = no_init(lambda: AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", revision='float16', torch_dtype=torch.float16, low_cpu_mem_usage=True)) #.to(device, torch_dtype=torch.float16)
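# Note on the arguments above: revision='float16' selects the fp16 branch of the checkpoint
# (roughly half the download size of the fp32 weights), torch_dtype=torch.float16 keeps the
# weights in half precision, and low_cpu_mem_usage=True avoids materialising a second full
# copy of the model in CPU RAM while the state dict is loaded.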
print(" Move Model")
model.to(device)
print(" Load Tokenizer")
#tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B", revision="float16")
print(" Tokenize and Generate")
prompt = "Faced with the problem of universal happiness, the great oracle GPT-J-6B spoke wisely when it said:'"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
generated_ids = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=200)
generated_text = tokenizer.decode(generated_ids[0])
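# (Optional, an assumption rather than part of the original gist: generate() may warn that it
#  is setting pad_token_id to eos_token_id; passing pad_token_id=tokenizer.eos_token_id
#  explicitly silences that, and tokenizer.decode(..., skip_special_tokens=True) drops any
#  special tokens from the output.)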
print()
print("Prompt:",prompt)
print("Generated:",generated_text)
print("Done J-6B")
print()
print("AFTER")
call(["nvidia-smi", "--format=csv", "--query-gpu=index,name,driver_version,memory.total,memory.used,memory.free"])
print()
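# Optional cleanup (not in the original gist): free the model and let CUDA release its
# cached blocks if you want to keep using the interpreter afterwards.
# del model
# torch.cuda.empty_cache()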