Running GPT-J-6B (semi-)direct from HuggingFace on a 3080 or >=14GB VRAM GPU
# Running GPT-J-6B on a 3080 (semi-)direct from HuggingFace
#
# My laptop card has 15458 MiB free, but has problems loading the model directly from HF.
# This code fixes it using the "no_init" method by @kurumuz
# found in https://discord.com/channels/729741769192767510/851918317039255592/916463267264725083
# It both loads faster and avoids the intermediate out-of-memory error.
# I now have 2564 MiB left over.
#
# BEFORE LOADING
# index, name, driver_version, memory.total [MiB], memory.used [MiB], memory.free [MiB]
# 0, NVIDIA GeForce RTX 3080 Laptop GPU, 471.41, 16384 MiB, 926 MiB, 15458 MiB
#
# AFTER LOADING
# index, name, driver_version, memory.total [MiB], memory.used [MiB], memory.free [MiB]
# 0, NVIDIA GeForce RTX 3080 Laptop GPU, 471.41, 16384 MiB, 13820 MiB, 2564 MiB
#
# Conda installs:
# conda install transformers
# conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch
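#
# How the fix works: no_init() (defined below) temporarily swaps reset_parameters
# on torch.nn.Linear / Embedding / LayerNorm for a no-op while from_pretrained runs,
# so the default random weight initialization, which the fp16 checkpoint overwrites
# anyway, is skipped during loading.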
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import sys
from subprocess import call

print('__Python VERSION:', sys.version)
print('__pyTorch VERSION:', torch.__version__)
print('__CUDA VERSION:', torch.version.cuda)
# call(["nvcc", "--version"])  # does not work
#! nvcc --version
print('__CUDNN VERSION:', torch.backends.cudnn.version())
print('__Number CUDA Devices:', torch.cuda.device_count())
print('__Devices')
print()

print("BEFORE")
call(["nvidia-smi", "--format=csv", "--query-gpu=index,name,driver_version,memory.total,memory.used,memory.free"])
print()

print('Active CUDA Device: GPU', torch.cuda.current_device())
print('Available devices ', torch.cuda.device_count())
print('Current cuda device ', torch.cuda.current_device())

print("Start GPT-j-6B")
if torch.cuda.is_available():
    print("Using GPU")
    device = torch.device("cuda")
else:
    print("Using CPU")
    device = torch.device("cpu")
def no_init(loading_code):
    # Temporarily disable weight (re-)initialization for the common module types,
    # run the loading code, then restore the original reset_parameters methods.
    def dummy(self):
        return

    modules = [torch.nn.Linear, torch.nn.Embedding, torch.nn.LayerNorm]
    original = {}
    for mod in modules:
        original[mod] = mod.reset_parameters
        mod.reset_parameters = dummy
    result = loading_code()
    for mod in modules:
        mod.reset_parameters = original[mod]
    return result
print(" Load Model") | |
#model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B") | |
model = no_init(lambda: AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", revision='float16', torch_dtype=torch.float16, low_cpu_mem_usage=True)) #.to(device, torch_dtype=torch.float16) | |
print(" Move Model") | |
model.to(device) | |
print(" Load Tokenizer") | |
#tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B") | |
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B", revision="float16") | |
print(" Tokenize and Generate") | |
prompt = "Faced with the problem of universal happiness, the great oracle GPT-J-6B spoke wisely when it said:'" | |
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device) | |
generated_ids = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=200) | |
generated_text = tokenizer.decode(generated_ids[0]) | |
print() | |
print("Prompt:",prompt) | |
print("Generated:",generated_text) | |
print("Done J-6B") | |
print() | |
print("AFTER") | |
call(["nvidia-smi", "--format=csv", "--query-gpu=index,name,driver_version,memory.total,memory.used,memory.free"]) | |
print() |
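# Optional cross-check (illustrative): report the GPU memory held by this process
# from within PyTorch, without shelling out to nvidia-smi.
if torch.cuda.is_available():
    print("torch reports: allocated {:.0f} MiB, reserved {:.0f} MiB".format(
        torch.cuda.memory_allocated() / 2**20, torch.cuda.memory_reserved() / 2**20))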