Gradio demo for FLUX.1 Kontext [dev], but as a 4-bit quant, so it fits on a 4090
import gradio as gr
import numpy as np
import spaces
import torch
import random
from PIL import Image
from diffusers import FluxKontextPipeline
from diffusers.utils import load_image

# Import quantization libraries
try:
    import bitsandbytes
    import hqq
    print("Quantization libraries loaded")
except ImportError as e:
    print(f"Quantization libraries not available: {e}")

MAX_SEED = np.iinfo(np.int32).max

# Try HQQ 4-bit quantized version for much lower VRAM usage
try:
    pipe = FluxKontextPipeline.from_pretrained(
        "HighCWu/FLUX.1-Kontext-dev-bnb-hqq-4bit",
        torch_dtype=torch.bfloat16
    ).to("cuda")
    print("Using HQQ 4-bit quantized model")
except Exception as e:
    print(f"HQQ model failed: {e}")
    # Fallback to original with aggressive optimizations
    pipe = FluxKontextPipeline.from_pretrained(
        "black-forest-labs/FLUX.1-Kontext-dev",
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True
    )
    pipe.enable_sequential_cpu_offload()
    print("Using original model with CPU offload")


@spaces.GPU
def infer(input_image, prompt, seed=42, randomize_seed=False, guidance_scale=2.5, steps=28,
          progress=gr.Progress(track_tqdm=True)):
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)

    if input_image:
        input_image = input_image.convert("RGB")
        image = pipe(
            image=input_image,
            prompt=prompt,
            guidance_scale=guidance_scale,
            width=input_image.size[0],
            height=input_image.size[1],
            num_inference_steps=steps,
            generator=torch.Generator().manual_seed(seed),
        ).images[0]
    else:
        image = pipe(
            prompt=prompt,
            guidance_scale=guidance_scale,
            num_inference_steps=steps,
            generator=torch.Generator().manual_seed(seed),
        ).images[0]

    return image, seed, gr.Button(visible=True)


@spaces.GPU
def infer_example(input_image, prompt):
    image, seed, _ = infer(input_image, prompt)
    return image, seed


css = """
#col-container {
    margin: 0 auto;
    max-width: 960px;
}
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown(f"""# FLUX.1 Kontext [dev] - HQQ 4-bit
Image editing model with 4-bit quantization for lower VRAM usage
""")
        with gr.Row():
            with gr.Column():
                input_image = gr.Image(label="Upload the image for editing", type="pil")
                with gr.Row():
                    prompt = gr.Text(
                        label="Prompt",
                        show_label=False,
                        max_lines=1,
                        placeholder="Enter your prompt for editing (e.g., 'Remove glasses', 'Add a hat')",
                        container=False,
                    )
                    run_button = gr.Button("Run", scale=0)
                with gr.Accordion("Advanced Settings", open=False):
                    seed = gr.Slider(
                        label="Seed",
                        minimum=0,
                        maximum=MAX_SEED,
                        step=1,
                        value=0,
                    )
                    randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
                    guidance_scale = gr.Slider(
                        label="Guidance Scale",
                        minimum=1,
                        maximum=10,
                        step=0.1,
                        value=2.5,
                    )
                    steps = gr.Slider(
                        label="Steps",
                        minimum=1,
                        maximum=30,
                        value=28,
                        step=1
                    )
            with gr.Column():
                result = gr.Image(label="Result", show_label=False, interactive=False)
                reuse_button = gr.Button("Reuse this image", visible=False)

        examples = gr.Examples(
            examples=[
                ["flowers.png", "turn the flowers into sunflowers"],
                ["monster.png", "make this monster ride a skateboard on the beach"],
                ["cat.png", "make this cat happy"]
            ],
            inputs=[input_image, prompt],
            outputs=[result, seed],
            fn=infer_example,
            cache_examples="lazy"
        )

    gr.on(
        triggers=[run_button.click, prompt.submit],
        fn=infer,
        inputs=[input_image, prompt, seed, randomize_seed, guidance_scale, steps],
        outputs=[result, seed, reuse_button]
    )

    reuse_button.click(
        fn=lambda image: image,
        inputs=[result],
        outputs=[input_image]
    )

demo.launch(mcp_server=True)
requirements.txt:

torch
torchvision
torchaudio
gradio
diffusers
transformers>=4.53.1
numpy<2
bitsandbytes
hqq
pillow
spaces
The P100 is old, from 2016, unfortunately. Its memory management is simply not up to what HQQ needs. Even if it could load the model, it would run very slowly.
Pascal Architecture Limitations:
No Tensor Cores: Critical for AI inference acceleration
Compute Capability 6.0: Limited optimization support
Older Memory Interface: HBM2 at lower speeds
Limited Quantization Support: Reduced compatibility with modern methods
NVIDIA doesn't support that GPU for this kind of model either; see the support matrix:
https://docs.nvidia.com/nim/visual-genai/1.1.1/support-matrix.html
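
If you want to catch this up front instead of waiting for the HQQ load to fail, a minimal sketch along these lines could gate the model-loading step. The compute-capability threshold and the fallback choice are my assumptions, not something from the gist above:

import torch

def pick_loading_strategy():
    """Rough GPU gate for the demo's two loading paths.

    Assumption: cards with compute capability >= 8.0 (Ampere and newer; a 4090
    is 8.9) get the HQQ 4-bit model, while older cards such as the P100 (6.0)
    fall back to the CPU-offload branch instead of crashing mid-load.
    """
    if not torch.cuda.is_available():
        return "cpu_offload"
    major, minor = torch.cuda.get_device_capability(0)
    name = torch.cuda.get_device_name(0)
    print(f"Detected {name} (compute capability {major}.{minor})")
    if (major, minor) >= (8, 0):
        return "hqq_4bit"
    return "cpu_offload"

# Example: decide which branch of the gist's try/except to attempt first
strategy = pick_loading_strategy()
print(f"Loading strategy: {strategy}")

On a 4090 (compute capability 8.9) this would pick the HQQ 4-bit path; on a P100 (6.0) it would go straight to sequential CPU offload.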