@sayakpaul
Last active April 7, 2025 21:44
This gist shows how to run Flux on a 24GB 4090 card with Diffusers, by splitting inference into three stages (text encoding, denoising, VAE decoding) and freeing each set of components before loading the next.
from diffusers import FluxPipeline, AutoencoderKL
from diffusers.image_processor import VaeImageProcessor
from transformers import T5EncoderModel, T5TokenizerFast, CLIPTokenizer, CLIPTextModel
import torch
import gc


def flush():
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_max_memory_allocated()
    torch.cuda.reset_peak_memory_stats()


def bytes_to_giga_bytes(bytes):
    return bytes / 1024 / 1024 / 1024


flush()

ckpt_id = "black-forest-labs/FLUX.1-schnell"
prompt = "a photo of a dog with cat-like look"

# Stage 1: load only the text encoders and tokenizers to compute the prompt embeddings.
text_encoder = CLIPTextModel.from_pretrained(
    ckpt_id, subfolder="text_encoder", torch_dtype=torch.bfloat16
)
text_encoder_2 = T5EncoderModel.from_pretrained(
    ckpt_id, subfolder="text_encoder_2", torch_dtype=torch.bfloat16
)
tokenizer = CLIPTokenizer.from_pretrained(ckpt_id, subfolder="tokenizer")
tokenizer_2 = T5TokenizerFast.from_pretrained(ckpt_id, subfolder="tokenizer_2")

pipeline = FluxPipeline.from_pretrained(
    ckpt_id,
    text_encoder=text_encoder,
    text_encoder_2=text_encoder_2,
    tokenizer=tokenizer,
    tokenizer_2=tokenizer_2,
    transformer=None,
    vae=None,
).to("cuda")

with torch.no_grad():
    print("Encoding prompts.")
    prompt_embeds, pooled_prompt_embeds, text_ids = pipeline.encode_prompt(
        prompt=prompt, prompt_2=None, max_sequence_length=256
    )

# Free the text encoders before loading the transformer.
del text_encoder
del text_encoder_2
del tokenizer
del tokenizer_2
del pipeline
flush()

# Stage 2: load only the transformer and run the denoising loop in latent space.
pipeline = FluxPipeline.from_pretrained(
    ckpt_id,
    text_encoder=None,
    text_encoder_2=None,
    tokenizer=None,
    tokenizer_2=None,
    vae=None,
    torch_dtype=torch.bfloat16,
).to("cuda")

print("Running denoising.")
height, width = 768, 1360
# No need to wrap this in `torch.no_grad()`: the pipeline's call method
# is already decorated with it.
latents = pipeline(
    prompt_embeds=prompt_embeds,
    pooled_prompt_embeds=pooled_prompt_embeds,
    num_inference_steps=4,
    guidance_scale=0.0,
    height=height,
    width=width,
    output_type="latent",
).images
print(f"{latents.shape=}")

# Free the transformer before loading the VAE.
del pipeline.transformer
del pipeline
flush()

# Stage 3: load only the VAE and decode the latents into an image.
vae = AutoencoderKL.from_pretrained(ckpt_id, revision="refs/pr/1", subfolder="vae", torch_dtype=torch.bfloat16).to(
    "cuda"
)
vae_scale_factor = 2 ** (len(vae.config.block_out_channels))
image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor)

with torch.no_grad():
    print("Running decoding.")
    latents = FluxPipeline._unpack_latents(latents, height, width, vae_scale_factor)
    latents = (latents / vae.config.scaling_factor) + vae.config.shift_factor

    image = vae.decode(latents, return_dict=False)[0]
    image = image_processor.postprocess(image, output_type="pil")
    image[0].save("image.png")
@toilaluan

Simpler version that works well on an RTX 4090. Avg 4.5 to 5 s for the text encoders, 1 to 1.2 it/s for the denoising model.

from transformers import T5EncoderModel
import time
import gc
import torch
import diffusers

def flush():
    gc.collect()
    torch.cuda.empty_cache()

t5_encoder = T5EncoderModel.from_pretrained(
    "black-forest-labs/FLUX.1-schnell", subfolder="text_encoder_2", revision="refs/pr/7", torch_dtype=torch.bfloat16
)
text_encoder = diffusers.DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-schnell",
    text_encoder_2=t5_encoder,
    transformer=None,
    vae=None,
    revision="refs/pr/7",
)
pipeline = diffusers.DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-schnell", 
    torch_dtype=torch.bfloat16,
    revision="refs/pr/1",
    text_encoder_2=None,
    text_encoder=None,
)
pipeline.enable_model_cpu_offload()

@torch.inference_mode()
def inference(self, prompt, num_inference_steps=4, guidance_scale=0.0, width=1024, height=1024):
    self.text_encoder.to("cuda")
    start = time.time()
    (
        prompt_embeds,
        pooled_prompt_embeds,
        _,
    ) = self.text_encoder.encode_prompt(prompt=prompt, prompt_2=None, max_sequence_length=256)
    self.text_encoder.to("cpu")
    flush()
    print(f"Prompt encoding time: {time.time() - start}")
    output = self.pipeline(
        prompt_embeds=prompt_embeds.bfloat16(),
        pooled_prompt_embeds=pooled_prompt_embeds.bfloat16(),
        width=width,
        height=height,
        guidance_scale=guidance_scale,
        num_inference_steps=num_inference_steps
    )
    image = output.images[0]
    return image
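
As posted, inference expects self to be any object exposing text_encoder and pipeline attributes. A minimal calling sketch under that assumption, reusing the two pipelines built above:

import types

# Hypothetical holder object: bundles the text-encoder pipeline and the
# denoising pipeline so the self-based signature works as written.
holder = types.SimpleNamespace(text_encoder=text_encoder, pipeline=pipeline)
image = inference(holder, "a photo of a dog with cat-like look")
image.save("image.png")
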

@sayakpaul

Thanks! This is a lot cleaner.

@bil-ash

bil-ash commented Aug 3, 2024


@toilaluan Can you make it work under 16GB of VRAM so that it can run on an RTX A4000?

@james-imi

@bil-ash why is inference using a self parameter?

@mortenmoulder

@toilaluan I've been trying to run your example, but without success. I keep getting errors such as:

AttributeError: 'NoneType' object has no attribute 'to'

I'm guessing that's because self.text_encoder is None.

Removing the self references does make the code run, and on my 4090 it pegs the GPU at 100% for both memory and compute, but what takes you seconds takes minutes on my end. Any clue what I'm doing wrong?

Oh and I just added this at the bottom:

if __name__ == "__main__":
    prompt = "A cat holding a sign that says hello world"
    generated_image = inference(pipeline, prompt)

    generated_image.save("output.png")
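
For reference, a minimal sketch of the "remove the self portion" variant, assuming the module-level text_encoder and pipeline objects from @toilaluan's snippet are in scope (the call at the bottom then becomes inference(prompt)):

@torch.inference_mode()
def inference(prompt, num_inference_steps=4, guidance_scale=0.0, width=1024, height=1024):
    # Use the module-level `text_encoder` pipeline (CLIP + T5) and `pipeline`
    # (transformer + VAE) instead of attributes on `self`.
    text_encoder.to("cuda")
    prompt_embeds, pooled_prompt_embeds, _ = text_encoder.encode_prompt(
        prompt=prompt, prompt_2=None, max_sequence_length=256
    )
    text_encoder.to("cpu")
    flush()
    output = pipeline(
        prompt_embeds=prompt_embeds.bfloat16(),
        pooled_prompt_embeds=pooled_prompt_embeds.bfloat16(),
        width=width,
        height=height,
        guidance_scale=guidance_scale,
        num_inference_steps=num_inference_steps,
    )
    return output.images[0]
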

@sumitmamoria


@toilaluan How do I use a LoRA with this code? Thanks!

@jason-engage

@toilaluan Your refactor works great. Does anybody know how to get image-to-image working on a 4090? I haven't been able to modify this script successfully.

@jordyBonnet

Same issue on my side (Windows 11) as @mortenmoulder describes above.

@fnauman

fnauman commented Dec 3, 2024

This only works at 512x512 on a 4090 for me; 1024x1024 goes out of memory.

Example:
python main.py "a photo of a dog with cat-like look"

from transformers import T5EncoderModel
import time
import gc
import torch
import diffusers
import argparse
from PIL import Image
import os

def flush():
    gc.collect()
    torch.cuda.empty_cache()

class FluxSchnell:
    def __init__(self):
        self.t5_encoder = T5EncoderModel.from_pretrained(
            "black-forest-labs/FLUX.1-schnell", subfolder="text_encoder_2", revision="refs/pr/7", torch_dtype=torch.bfloat16
        )
        self.text_encoder = diffusers.DiffusionPipeline.from_pretrained(
            "black-forest-labs/FLUX.1-schnell",
            text_encoder_2=self.t5_encoder,
            transformer=None,
            vae=None,
            revision="refs/pr/7",
        )
        self.pipeline = diffusers.DiffusionPipeline.from_pretrained(
            "black-forest-labs/FLUX.1-schnell", 
            torch_dtype=torch.bfloat16,
            revision="refs/pr/1",
            text_encoder_2=None,
            text_encoder=None,
        )
        self.pipeline.enable_model_cpu_offload()

    @torch.inference_mode()
    def inference(self, prompt, num_inference_steps=4, guidance_scale=0.0, width=1024, height=1024):
        self.text_encoder.to("cuda")
        start = time.time()
        (
            prompt_embeds,
            pooled_prompt_embeds,
            _,
        ) = self.text_encoder.encode_prompt(prompt=prompt, prompt_2=None, max_sequence_length=256)
        self.text_encoder.to("cpu")

        flush()
        
        print(f"Prompt encoding time: {time.time() - start}")
        
        output = self.pipeline(
            prompt_embeds=prompt_embeds.bfloat16(),
            pooled_prompt_embeds=pooled_prompt_embeds.bfloat16(),
            width=width,
            height=height,
            guidance_scale=guidance_scale,
            num_inference_steps=num_inference_steps
        )
        image = output.images[0]
        return image


def main():
    parser = argparse.ArgumentParser(description='Generate images using FLUX.1-schnell model')
    parser.add_argument('prompt', type=str, help='Enter your prompt here')
    parser.add_argument('--steps', type=int, default=4, help='Number of inference steps (default: 4)')
    parser.add_argument('--guidance-scale', type=float, default=0.0, help='Guidance scale (default: 0.0)')
    parser.add_argument('--width', type=int, default=512, help='Image width (default: 512)')
    parser.add_argument('--height', type=int, default=512, help='Image height (default: 512)')
    parser.add_argument('--output', type=str, default='output.png', help='Output image path (default: output.png)')
    
    args = parser.parse_args()
    
    model = FluxSchnell()
    image = model.inference(
        prompt=args.prompt,
        num_inference_steps=args.steps,
        guidance_scale=args.guidance_scale,
        width=args.width,
        height=args.height
    )
    
    # Save the image
    image.save(args.output)
    print(f"Image saved to: {os.path.abspath(args.output)}")

if __name__ == '__main__':
    main()
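
One knob that may help with the 1024x1024 OOM, if the failure happens during the VAE decode (untested here): AutoencoderKL supports sliced and tiled decoding, which lowers the peak memory of that step. A sketch, added after enable_model_cpu_offload() in __init__:

        # Decode the latents in slices/tiles to reduce peak VRAM at large resolutions.
        self.pipeline.vae.enable_slicing()
        self.pipeline.vae.enable_tiling()
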

@jayendramadaram

Awesome! Works smoothly on a 4090! Any idea how to run it with a LoRA? Loading a LoRA into the pipeline and running it is not working for me.

@thomsonm685

@jayendramadaram Here's my code for loading my LoRA (you'll need to pip install peft first):

from transformers import T5EncoderModel
import time
import gc
import torch
import diffusers

def flush():
    gc.collect()
    torch.cuda.empty_cache()
    
class FluxSchnell:
    def __init__(self):
        self.t5_encoder = T5EncoderModel.from_pretrained(
            "black-forest-labs/FLUX.1-schnell", subfolder="text_encoder_2", revision="refs/pr/7", torch_dtype=torch.bfloat16
        )
        self.text_encoder = diffusers.DiffusionPipeline.from_pretrained(
            "black-forest-labs/FLUX.1-schnell",
            text_encoder_2=self.t5_encoder,
            transformer=None,
            vae=None,
            revision="refs/pr/7",
        )
        self.pipeline = diffusers.DiffusionPipeline.from_pretrained(
            "black-forest-labs/FLUX.1-schnell", 
            torch_dtype=torch.bfloat16,
            revision="refs/pr/1",
            text_encoder_2=None,
            text_encoder=None,
        )
        self.pipeline.enable_model_cpu_offload()
        self.pipeline.load_lora_weights("./path/to/lora")

    print("HERE")

    @torch.inference_mode()
    def inference(self, prompt, filename, num_inference_steps=4, guidance_scale=0.0, width=1024, height=1024):
        self.text_encoder.to("cuda")
        start = time.time()
        (
            prompt_embeds,
            pooled_prompt_embeds,
            _,
        ) = self.text_encoder.encode_prompt(prompt=prompt, prompt_2=None, max_sequence_length=256)
        self.text_encoder.to("cpu")
        flush()
        print(f"Prompt encoding time: {time.time() - start}")
        output = self.pipeline(
            prompt_embeds=prompt_embeds.bfloat16(),
            pooled_prompt_embeds=pooled_prompt_embeds.bfloat16(),
            width=width,
            height=height,
            guidance_scale=guidance_scale,
            num_inference_steps=num_inference_steps
        )
        image = output.images[0]
    
        image.save(filename)
        print("FINISHED")
        return image

model = FluxSchnell()
generated_image = model.inference(prompt="some really cool stuff", filename="output2.png")

@Vladimir-Urik

torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 90.00 MiB. GPU 0 has a total capacity of 19.56 GiB of which 6.25 MiB is free. Including non-PyTorch memory, this process has 19.54 GiB memory in use. Of the allocated memory 19.35 GiB is allocated by PyTorch, and 6.14 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
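
A minimal sketch of the mitigation the error message itself points at (it addresses allocator fragmentation, not total VRAM usage), assuming it runs before any CUDA memory is allocated:

import os

# Must be set before the first CUDA allocation (i.e., before the pipelines are built).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
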
