
@fancellu
Last active July 18, 2025 11:51
Gradio demo for FLUX.1 Kontext [dev], but as a 4-bit quant, so it fits on a 4090
import gradio as gr
import numpy as np
import spaces
import torch
import random
from PIL import Image
from diffusers import FluxKontextPipeline
from diffusers.utils import load_image
# Import quantization libraries
try:
    import bitsandbytes
    import hqq
    print("Quantization libraries loaded")
except ImportError as e:
    print(f"Quantization libraries not available: {e}")
MAX_SEED = np.iinfo(np.int32).max
# Try HQQ 4-bit quantized version for much lower VRAM usage
try:
    pipe = FluxKontextPipeline.from_pretrained(
        "HighCWu/FLUX.1-Kontext-dev-bnb-hqq-4bit",
        torch_dtype=torch.bfloat16
    ).to("cuda")
    print("Using HQQ 4-bit quantized model")
except Exception as e:
    print(f"HQQ model failed: {e}")
    # Fallback to original with aggressive optimizations
    pipe = FluxKontextPipeline.from_pretrained(
        "black-forest-labs/FLUX.1-Kontext-dev",
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True
    )
    pipe.enable_sequential_cpu_offload()
    print("Using original model with CPU offload")
@spaces.GPU
def infer(input_image, prompt, seed=42, randomize_seed=False, guidance_scale=2.5, steps=28,
          progress=gr.Progress(track_tqdm=True)):
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    if input_image:
        # Image editing: keep the output at the input image's resolution
        input_image = input_image.convert("RGB")
        image = pipe(
            image=input_image,
            prompt=prompt,
            guidance_scale=guidance_scale,
            width=input_image.size[0],
            height=input_image.size[1],
            num_inference_steps=steps,
            generator=torch.Generator().manual_seed(seed),
        ).images[0]
    else:
        # No input image: plain text-to-image generation
        image = pipe(
            prompt=prompt,
            guidance_scale=guidance_scale,
            num_inference_steps=steps,
            generator=torch.Generator().manual_seed(seed),
        ).images[0]
    return image, seed, gr.Button(visible=True)
@spaces.GPU
def infer_example(input_image, prompt):
    image, seed, _ = infer(input_image, prompt)
    return image, seed
css = """
#col-container {
margin: 0 auto;
max-width: 960px;
}
"""
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown(f"""# FLUX.1 Kontext [dev] - HQQ 4-bit
Image editing model with 4-bit quantization for lower VRAM usage
        """)
        with gr.Row():
            with gr.Column():
                input_image = gr.Image(label="Upload the image for editing", type="pil")
                with gr.Row():
                    prompt = gr.Text(
                        label="Prompt",
                        show_label=False,
                        max_lines=1,
                        placeholder="Enter your prompt for editing (e.g., 'Remove glasses', 'Add a hat')",
                        container=False,
                    )
                    run_button = gr.Button("Run", scale=0)
                with gr.Accordion("Advanced Settings", open=False):
                    seed = gr.Slider(
                        label="Seed",
                        minimum=0,
                        maximum=MAX_SEED,
                        step=1,
                        value=0,
                    )
                    randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
                    guidance_scale = gr.Slider(
                        label="Guidance Scale",
                        minimum=1,
                        maximum=10,
                        step=0.1,
                        value=2.5,
                    )
                    steps = gr.Slider(
                        label="Steps",
                        minimum=1,
                        maximum=30,
                        value=28,
                        step=1
                    )
            with gr.Column():
                result = gr.Image(label="Result", show_label=False, interactive=False)
                # Hidden until infer() returns a visible button alongside the first result
                reuse_button = gr.Button("Reuse this image", visible=False)
        examples = gr.Examples(
            examples=[
                ["flowers.png", "turn the flowers into sunflowers"],
                ["monster.png", "make this monster ride a skateboard on the beach"],
                ["cat.png", "make this cat happy"]
            ],
            inputs=[input_image, prompt],
            outputs=[result, seed],
            fn=infer_example,
            cache_examples="lazy"
        )
    # Run on button click or when Enter is pressed in the prompt box
    gr.on(
        triggers=[run_button.click, prompt.submit],
        fn=infer,
        inputs=[input_image, prompt, seed, randomize_seed, guidance_scale, steps],
        outputs=[result, seed, reuse_button]
    )
    # Copy the generated result back into the input slot for further edits
    reuse_button.click(
        fn=lambda image: image,
        inputs=[result],
        outputs=[input_image]
    )
demo.launch(mcp_server=True)
torch
torchvision
torchaudio
gradio
diffusers
transformers>=4.53.1
numpy<2
bitsandbytes
hqq
pillow
spaces

fancellu commented Jul 7, 2025

[attached image: flux4bitcat]

@MohamedLahmeri01

Can you help me? I am trying to run it on a Kaggle P100 GPU, but got this error:

"Quantization libraries loaded

Loading pipeline components...: 100% 7/7 [00:11<00:00, 2.92s/it]

HQQ model failed: .to is not supported for HQQ-quantized models."

@fancellu (Author)

The P100 is very old (2016), unfortunately. Its memory management is simply not up to HQQ's needs, and even if it could run, it would run extremely slowly.

Pascal Architecture Limitations:

No Tensor Cores: Critical for AI inference acceleration

Compute Capability 6.0: Limited optimization support

Older Memory Interface: HBM2 at lower speeds

Limited Quantization Support: Reduced compatibility with modern methods

Nvidia doesn't support this kind of workload on that GPU model either:

https://docs.nvidia.com/nim/visual-genai/1.1.1/support-matrix.html
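For anyone else on older hardware: a quick way to see this up front is to check the compute capability at startup and skip the HQQ path on pre-Volta cards. This is just a sketch, not part of the gist; the 7.0 threshold and the gpu_supports_hqq_path helper name are my own.

import torch

def gpu_supports_hqq_path(min_capability=(7, 0)):
    # Rough guard: only attempt the HQQ 4-bit model on Volta-or-newer GPUs.
    # A P100 reports (6, 0), so it would go straight to the CPU-offload fallback
    # instead of failing partway through loading.
    if not torch.cuda.is_available():
        return False
    major, minor = torch.cuda.get_device_capability(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}, compute capability {major}.{minor}")
    return (major, minor) >= min_capability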
