Skip to content

Instantly share code, notes, and snippets.

@Norod
Created April 29, 2025 14:25
Show Gist options
  • Save Norod/bf4843647c686a1d90d4a0f54b337186 to your computer and use it in GitHub Desktop.
Save Norod/bf4843647c686a1d90d4a0f54b337186 to your computer and use it in GitHub Desktop.
Detect objects with Florence2, create an image mask and inpaint with GPT-Image-1
from pathlib import Path

import torch
from PIL import Image
from PIL import ImageDraw # Added for mask creation
from PIL import ImageOps
from transformers import AutoProcessor, AutoModelForCausalLM
#################################################
# Candidate canvas sizes; the selected one is handed to Image.resize(),
# which takes a (width, height) pair.
INPUT_IMAGE_SIZE_PORTRAIT = (1024, 1536)
INPUT_IMAGE_SIZE_LANDSCAPE = (1536, 1024)
INPUT_IMAGE_SIZE_SQUARE = (1024, 1024)
#################################################
INPUT_IMAGE_NAME = "./IMG_0330.jpg"  # Path to your image
INPUT_IMAGE_SIZE = INPUT_IMAGE_SIZE_PORTRAIT  # Set the desired input image size
PROMPT_FOR_OBJECTS_TO_DETECT = "picture frame"
PROMPT_FOR_EDITING = "famous painting"
#################################################
# Base name shared by every generated file. Path.stem drops the directory and
# only the final extension, so "my.photo.jpg" yields "my.photo" rather than
# being cut at the first dot. Spaces are replaced to keep the names shell-friendly.
IMAGE_NAME_FOR_ARTIFACTS = Path(INPUT_IMAGE_NAME).stem.replace(" ", "_")
IMAGE_NAME_FOR_INPUT_ARTIFACT = IMAGE_NAME_FOR_ARTIFACTS + "_input.jpg"
IMAGE_NAME_FOR_MASK_ARTIFACT = IMAGE_NAME_FOR_ARTIFACTS + "_mask.png"
IMAGE_NAME_FOR_OUTPUT_ARTIFACT = IMAGE_NAME_FOR_ARTIFACTS + "_output.png"
# Pick the compute device: CUDA if available, otherwise MPS if available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
# Both accelerator backends use half precision; only the CPU path stays in float32.
torch_dtype = torch.float32 if device == "cpu" else torch.float16
print(f"Using device: {device} with dtype: {torch_dtype}")
# Load the Florence-2 checkpoint and its matching processor once at import time;
# run_example() below uses both as module-level globals.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Florence-2-base-ft",
    torch_dtype=torch_dtype,
    trust_remote_code=True,
).to(device)
processor = AutoProcessor.from_pretrained(
    "microsoft/Florence-2-base-ft",
    trust_remote_code=True,
)
def edit_image_with_openai(image_path, mask_path, prompt, output_path=None):
    """
    Call OpenAI Images Edit API to inpaint transparent areas of mask.

    Args:
        image_path: Path of the image file to edit; opened in binary mode.
        mask_path: Path of the mask file; opened in binary mode.
        prompt: Text prompt sent with the edit request.
        output_path: Where to write the decoded result. Defaults to
            IMAGE_NAME_FOR_OUTPUT_ARTIFACT so the output is named after the
            input image instead of a fixed file name.

    Raises:
        RuntimeError: If the response carries no base64 image payload.
    """
    # Imported lazily so the module can be loaded without the OpenAI SDK.
    from openai import OpenAI
    import base64

    if output_path is None:
        output_path = IMAGE_NAME_FOR_OUTPUT_ARTIFACT

    # NOTE(review): OpenAI() is constructed without arguments, so credentials
    # presumably come from the environment -- confirm for your setup.
    client = OpenAI()
    with open(image_path, "rb") as img_file, open(mask_path, "rb") as mask_file:
        result = client.images.edit(
            model="gpt-image-1",
            image=img_file,
            mask=mask_file,
            prompt=prompt,
        )

    # Fail with a clear message rather than an opaque IndexError/TypeError
    # when the response has no usable payload.
    if not result.data or not result.data[0].b64_json:
        raise RuntimeError("OpenAI image edit returned no image data")

    image_bytes = base64.b64decode(result.data[0].b64_json)
    with open(output_path, "wb") as out_file:
        out_file.write(image_bytes)
def run_example(task_prompt, image_name, text_input=None):
    """
    Run one Florence-2 task on an image file and return the parsed answer.

    Args:
        task_prompt: Task token string; also passed as the ``task`` used for
            post-processing.
        image_name: Path of the image to open.
        text_input: Optional text appended directly to ``task_prompt``.

    Returns:
        Whatever ``processor.post_process_generation`` produces; the script's
        main section indexes it by ``task_prompt``.
    """
    picture = Image.open(image_name)
    full_prompt = task_prompt if text_input is None else task_prompt + text_input

    # Move the encoded inputs to the module-level device/dtype chosen at import.
    model_inputs = processor(text=full_prompt, images=picture, return_tensors="pt").to(device, torch_dtype)
    output_ids = model.generate(
        input_ids=model_inputs["input_ids"],
        pixel_values=model_inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3,
    )

    # Special tokens are kept in the decoded text that is handed to the
    # post-processor.
    decoded = processor.batch_decode(output_ids, skip_special_tokens=False)
    return processor.post_process_generation(
        decoded[0],
        task=task_prompt,
        image_size=(picture.width, picture.height),
    )
def resize_and_save_image(image_name, size, output_path):
    """
    Resize and save the image to the specified path.

    The image is stretched to exactly ``size``; the aspect ratio is not
    preserved. The result is converted to RGB before saving.

    Args:
        image_name: Path of the source image.
        size: Target size passed to ``Image.resize``.
        output_path: Destination path; the format follows its extension.
    """
    with Image.open(image_name) as source:
        # Camera photos often store rotation only in the EXIF Orientation tag.
        # Apply it to the pixels first; otherwise the picture is resized
        # sideways and the tag is not carried over on save.
        upright = ImageOps.exif_transpose(source)
        resized_image = upright.resize(size).convert("RGB")
    resized_image.save(output_path)
# Script entry point: detect objects, build the mask, then request the edit.
if __name__ == "__main__":
    # Run phrase grounding for the configured object prompt
    task_prompt = "<CAPTION_TO_PHRASE_GROUNDING>"
    resize_and_save_image(INPUT_IMAGE_NAME, INPUT_IMAGE_SIZE, IMAGE_NAME_FOR_INPUT_ARTIFACT)
    results = run_example(task_prompt, IMAGE_NAME_FOR_INPUT_ARTIFACT, PROMPT_FOR_OBJECTS_TO_DETECT)
    data = results[task_prompt]

    # Keep only the boxes whose label matches the requested phrase (case-insensitive).
    wanted_label = PROMPT_FOR_OBJECTS_TO_DETECT.lower()
    matching_boxes = [
        bbox
        for bbox, label in zip(data["bboxes"], data["labels"])
        if label.lower() == wanted_label
    ]
    if not matching_boxes:
        # A mask with no transparent region gives the edit nothing to target,
        # so stop before sending a pointless (billed) request.
        raise SystemExit(
            f"No '{PROMPT_FOR_OBJECTS_TO_DETECT}' detected in {INPUT_IMAGE_NAME}; "
            "skipping the image edit."
        )

    # Build mask: opaque everywhere, transparent inside each detected box.
    # The resized input is opened only to read its dimensions.
    with Image.open(IMAGE_NAME_FOR_INPUT_ARTIFACT) as image:
        mask_size = image.size
    mask = Image.new("RGBA", mask_size, (0, 0, 0, 255))
    draw = ImageDraw.Draw(mask)
    for x1, y1, x2, y2 in matching_boxes:
        draw.rectangle([x1, y1, x2, y2], fill=(0, 0, 0, 0))
    mask.save(IMAGE_NAME_FOR_MASK_ARTIFACT)

    # Call OpenAI Images Edit API to inpaint the detected regions
    edit_image_with_openai(
        image_path=IMAGE_NAME_FOR_INPUT_ARTIFACT,
        mask_path=IMAGE_NAME_FOR_MASK_ARTIFACT,
        prompt=PROMPT_FOR_EDITING,
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment