Detailed writeup: https://huggingface2.notion.site/How-to-split-Flux-transformer-and-run-inference-aa1583ad23ce47a78589a79bb9309ab0
TL;DR: we split the models where possible and decouple the different stages of the pipeline.
# Copyright 2024 Black Forest Labs and The HuggingFace Team. All rights reserved. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, |
Detailed writeup: https://huggingface2.notion.site/How-to-split-Flux-transformer-and-run-inference-aa1583ad23ce47a78589a79bb9309ab0
TL;DR: we split the models where possible and decouple the different stages of the pipeline.
from diffusers import AutoPipelineForText2Image
import torch
# Load the FLUX.1-dev text-to-image pipeline in bfloat16, then move it to the GPU.
pipeline = AutoPipelineForText2Image.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    torch_dtype=torch.bfloat16,
)
pipeline = pipeline.to("cuda")

# Attach the yarn-art LoRA weights on top of the base checkpoint.
pipeline.load_lora_weights(
    "sayakpaul/yarn_art_lora_flux",
    weight_name="pytorch_lora_weights.safetensors",
)

# Generate for the prompt, keep the first returned image, and write it to disk.
# Only `height` is passed explicitly; `width` is left at the pipeline's default.
image = pipeline(
    "a puppy in a pond, yarn art style",
    guidance_scale=3.5,
    height=768,
).images[0]
image.save("yarn.png")
import torch | |
from huggingface_hub import hf_hub_download | |
from diffusers import FluxTransformer2DModel, DiffusionPipeline | |
# Target precision and device for the transformer.
dtype, device = torch.bfloat16, "cuda"
ckpt_id = "black-forest-labs/FLUX.1-schnell"

# Instantiate the transformer from its config under the "meta" device so the
# module structure is built without allocating real parameter storage.
# NOTE(review): the with-body indentation was reconstructed from a flattened
# paste — confirm that both statements belong inside the block.
with torch.device("meta"):
    config = FluxTransformer2DModel.load_config(ckpt_id, subfolder="transformer")
    model = FluxTransformer2DModel.from_config(config).to(dtype)
# Originally by jiwooya1000, put together by sayakpaul. | |
# Documentation: https://huggingface.co/docs/diffusers/main/en/training/distributed_inference | |
""" | |
Run: | |
accelerate launch distributed_inference_diffusers.py --batch_size 8 | |
# Enable memory optimizations for large models like SD3 | |
accelerate launch distributed_inference_diffusers.py --batch_size 8 --low_mem=1 |
Flux: https://blackforestlabs.ai/announcing-black-forest-labs/
torchao
The first resource even allows you to run the pipeline with under 16 GB of GPU VRAM.
from diffusers import FluxPipeline, AutoencoderKL | |
from diffusers.image_processor import VaeImageProcessor | |
from transformers import T5EncoderModel, T5TokenizerFast, CLIPTokenizer, CLIPTextModel | |
import torch | |
import gc | |
def flush():
    """Run Python garbage collection, then release PyTorch's cached CUDA memory.

    Returns:
        None.
    """
    # Collect first so that unreachable objects are dropped before the
    # allocator cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()
import torch | |
torch.set_float32_matmul_precision("high") | |
from diffusers import DiffusionPipeline | |
import time | |
pipeline_id = "ptx0/pixart-900m-1024-ft" | |
pipeline = DiffusionPipeline.from_pretrained( | |
pipeline_id, |
import torch | |
torch.set_float32_matmul_precision("high") | |
from diffusers import StableDiffusion3Pipeline | |
import time | |
id = "stabilityai/stable-diffusion-3-medium-diffusers" | |
pipeline = StableDiffusion3Pipeline.from_pretrained( | |
id, |
from diffusers import StableDiffusion3Pipeline | |
from transformers import T5EncoderModel | |
import torch | |
import time | |
import gc | |
def flush():
    """Run Python garbage collection, then release PyTorch's cached CUDA memory.

    Returns:
        None.
    """
    # Collect first so that unreachable objects are dropped before the
    # allocator cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()