Doron Adler Norod

Running Flux under limited resources with Diffusers

The first resource even allows you to run the pipeline under 16GBs of GPU VRAM.

Good question! I am collecting human data on how quantization affects outputs. See here for more information: ggml-org/llama.cpp#5962

In the meantime, use the largest that fully fits in your GPU. If you can comfortably fit Q4_K_S, try using a model with more parameters.

	from google import genai
	from google.genai import types
	import typing_extensions as typing
	from PIL import Image
	import requests
	import io
	import json
	import os

	import torch
	from diffusers import FluxTransformer2DModel
	import torch.utils.benchmark as benchmark
	from torchao.quantization import quantize_, int8_weight_only
	from torchao.utils import unwrap_tensor_subclass
	import torch._inductor

	torch._inductor.config.mixed_mm_choice = "triton"

	# download FluxCFGPipline
	!wget https://raw.githubusercontent.com/linoytsaban/diffusers/refs/heads/dreambooth-lora-flux-exploration/examples/community/pipeline_flux_with_cfg.py

	# load pipeline
	import diffusers
	import torch
	from pipeline_flux_with_cfg import FluxCFGPipeline

	pipe = FluxCFGPipeline.from_pretrained("black-forest-labs/FLUX.1-dev",
	torch_dtype=torch.bfloat16)

	import argparse
	import numpy as np
	import torch
	import torch.nn as nn
	import coremltools as ct
	from transformers import AutoTokenizer, AutoModelForCausalLM

	# When using float16, all predicted logits are 0. To be debugged.
	compute_precision = ct.precision.FLOAT32
	compute_units = ct.ComputeUnit.CPU_ONLY

	# Copyright 2023 Taiga Takano
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http:#www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,

	from pathlib import Path
	from diffusers import StableDiffusionXLPipeline
	import torch
	from safetensors.torch import save_file

	# text_encoder.text_model.encoder.layers.0.self_attn.k_proj.lora_linear_layer.down.weight
	# lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_down.weight
	# 1. text_encoder -> lora_te, text_encoder_2 -> lora_te2
	# 2. map
	# 3. .weight -> 2 .alpha -> 1 and replace . -> _

	import json
	import pickle
	import struct
	import zipfile
	import numpy as np
	from sentencepiece import SentencePieceProcessor

	def rms_norm(x): return (x / np.sqrt(np.square(x).mean(-1, keepdims=True) + 1e-6))
	def softmax(x): return (np.exp(x - np.max(x, axis=-1, keepdims=True))) / np.sum((np.exp(x - np.max(x, axis=-1, keepdims=True))), axis=-1, keepdims = True)