import torch
import json
from diffusers import StableDiffusionPipeline


def print_memory_usage(width, height, batch, xformers, with_lora):
    def on_off(cond):
        return 'ON' if cond else 'OFF'
    # Peak GPU memory allocated by PyTorch since the last reset_peak_memory_stats() call.
    mem_bytes = torch.cuda.max_memory_allocated()
    mem_MB = int(mem_bytes / (10**6))
    stats = {'width': width, 'height': height, 'batch': batch, 'xformers': on_off(xformers),
             'lora': on_off(with_lora), 'mem_MB': mem_MB}
    print(json.dumps(stats))


if __name__ == "__main__":
    prompt = "A pokemon with blue eyes."
    sd_model_id = "runwayml/stable-diffusion-v1-5"
    lora_weight_model_id = "sayakpaul/sd-model-finetuned-lora-t4"

    # Sweep over xFormers on/off, batch size, resolution, and LoRA on/off,
    # recreating the pipeline each time so peak-memory measurements don't leak
    # between configurations.
    for xformers in [False, True]:
        for batch in [1, 2, 4]:
            for width, height in [(512, 512), (512, 768)]:
                for with_lora in [False, True]:
                    torch.cuda.reset_peak_memory_stats()
                    pipe = StableDiffusionPipeline.from_pretrained(
                        sd_model_id, torch_dtype=torch.float16, safety_checker=None
                    ).to("cuda")
                    if xformers:
                        pipe.enable_xformers_memory_efficient_attention()
                    else:
                        pipe.disable_xformers_memory_efficient_attention()
                    pipe.set_progress_bar_config(disable=True)
                    if with_lora:
                        pipe.load_lora_weights(lora_weight_model_id)
                    # A few inference steps are enough to hit the peak attention memory.
                    pipe(prompt=prompt, width=width, height=height, num_inference_steps=3,
                         num_images_per_prompt=batch, generator=torch.manual_seed(0))
                    print_memory_usage(width, height, batch, xformers, with_lora)
                    del pipe
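
The script prints one JSON line per configuration, so the results can be collected and compared afterwards. A minimal post-processing sketch might look like the following (the results.jsonl file name is my own assumption; redirect the script's stdout there when running it):

import json

rows = []
with open("results.jsonl") as f:  # hypothetical file: python benchmark.py > results.jsonl
    for line in f:
        line = line.strip()
        if line.startswith("{"):
            rows.append(json.loads(line))

# Index peak memory by configuration, then report the LoRA overhead per setting.
mem = {(r['width'], r['height'], r['batch'], r['xformers'], r['lora']): r['mem_MB'] for r in rows}
for (w, h, b, xf, lora), with_lora_mb in sorted(mem.items()):
    if lora != 'ON':
        continue
    base_mb = mem.get((w, h, b, xf, 'OFF'))
    if base_mb is not None:
        print(f"{w}x{h} batch={b} xformers={xf}: "
              f"base {base_mb} MB, LoRA {with_lora_mb} MB (+{with_lora_mb - base_mb} MB)")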
Hey, I was looking through the Hugging Face LoRA discussion and found myself here. Did you discover anything about the memory efficiency of LoRA? The paper reports roughly 3x lower GPU memory usage, but I can't replicate it. Conceptually, if we modify the output of the first layer of a model, the next layer sees changes due to LoRA, so the full attention activations of the following layers still need to be kept in order to compute the gradients of the LoRA weights in the first layer. This would mean the memory usage of LoRA training is almost the same as fully fine-tuning the model. Is this what you found, or am I misunderstanding something? Sorry for writing here out of the blue; I don't really have any coworkers I can discuss this with.
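
In case it helps to probe that intuition outside of Stable Diffusion, here is a minimal sketch I would try (entirely my own toy setup, not from this gist: the MLP, the sizes, and the LoRALinear helper are all assumptions). It compares peak CUDA memory for one forward/backward pass of a fully trainable stack against the same stack frozen everywhere except a LoRA adapter in the first layer. If LoRA comes out noticeably lower, the gap is likely the missing weight-gradient buffers (and, in a real training loop, optimizer states) rather than activations, since the gradient for the first-layer adapter still has to flow back through every later layer.

# Toy comparison of peak training memory: full fine-tuning vs. LoRA on the first
# layer only. All shapes/sizes below are arbitrary assumptions for illustration.
import torch
import torch.nn as nn


class LoRALinear(nn.Module):
    """Frozen base Linear plus a trainable low-rank update (hypothetical helper)."""
    def __init__(self, base: nn.Linear, rank: int = 4):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad_(False)
        self.lora_a = nn.Linear(base.in_features, rank, bias=False)
        self.lora_b = nn.Linear(rank, base.out_features, bias=False)
        nn.init.zeros_(self.lora_b.weight)  # update starts at zero, so the base output is unchanged

    def forward(self, x):
        return self.base(x) + self.lora_b(self.lora_a(x))


def make_mlp(dim, depth):
    return nn.Sequential(*[nn.Sequential(nn.Linear(dim, dim), nn.GELU())
                           for _ in range(depth)])


def peak_mem_mb(model, x):
    # Peak memory for one forward + backward pass, in MB.
    torch.cuda.reset_peak_memory_stats()
    model(x).sum().backward()
    return torch.cuda.max_memory_allocated() / 1e6


if __name__ == "__main__":
    device, dim, depth, batch = "cuda", 4096, 12, 256
    x = torch.randn(batch, dim, device=device)

    # 1) Full fine-tuning: every weight is trainable.
    full = make_mlp(dim, depth).to(device)
    print("full fine-tune peak MB:", peak_mem_mb(full, x))
    del full
    torch.cuda.empty_cache()  # free the first model so it doesn't pollute the second measurement

    # 2) Everything frozen except a LoRA adapter in the first Linear layer.
    lora = make_mlp(dim, depth).to(device)
    for p in lora.parameters():
        p.requires_grad_(False)
    lora[0][0] = LoRALinear(lora[0][0]).to(device)
    print("LoRA-only peak MB:", peak_mem_mb(lora, x))

This is only a rough probe; the numbers will depend heavily on how much of the footprint is activations versus parameters for the particular model and batch size, so I wouldn't read too much into the toy values themselves.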