Created
April 29, 2025 14:25
-
-
Save Norod/bf4843647c686a1d90d4a0f54b337186 to your computer and use it in GitHub Desktop.
Detect objects with Florence2, create an image mask and inpaint with GPT-Image-1
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
from PIL import Image | |
from PIL import ImageDraw # Added for mask creation | |
from transformers import AutoProcessor, AutoModelForCausalLM | |
#################################################
# Selectable target sizes as (width, height); one of them is assigned to
# INPUT_IMAGE_SIZE below and passed to resize_and_save_image().
INPUT_IMAGE_SIZE_PORTRAIT = (1024, 1536)
INPUT_IMAGE_SIZE_LANDSCAPE = (1536, 1024)
INPUT_IMAGE_SIZE_SQUARE = (1024, 1024)
#################################################
INPUT_IMAGE_NAME = "./IMG_0330.jpg"  # Path to your image
INPUT_IMAGE_SIZE = INPUT_IMAGE_SIZE_PORTRAIT  # Set the desired input image size
PROMPT_FOR_OBJECTS_TO_DETECT = "picture frame"  # Phrase to ground in the image
PROMPT_FOR_EDITING = "famous painting"  # Prompt sent to the image edit call
#################################################


def artifact_stem(image_path):
    """
    Return the base name used for every artifact derived from *image_path*.

    The directory part and the final extension are dropped and spaces are
    replaced with underscores. Only the last suffix is removed, so a name
    such as "my.photo.jpg" keeps its inner dot ("my.photo") instead of being
    truncated at the first one.
    """
    from pathlib import Path

    return Path(image_path).stem.replace(" ", "_")


# File names of the intermediate and final artifacts, all derived from the
# input image's base name.
IMAGE_NAME_FOR_ARTIFACTS = artifact_stem(INPUT_IMAGE_NAME)
IMAGE_NAME_FOR_INPUT_ARTIFACT = IMAGE_NAME_FOR_ARTIFACTS + "_input.jpg"
IMAGE_NAME_FOR_MASK_ARTIFACT = IMAGE_NAME_FOR_ARTIFACTS + "_mask.png"
IMAGE_NAME_FOR_OUTPUT_ARTIFACT = IMAGE_NAME_FOR_ARTIFACTS + "_output.png"
# Pick the compute device: CUDA if available, otherwise MPS if available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Both accelerator backends use half precision; only the CPU fallback runs
# in float32.
torch_dtype = torch.float32 if device == "cpu" else torch.float16
print(f"Using device: {device} with dtype: {torch_dtype}")

# Load the Florence-2 checkpoint and its processor once at import time; the
# model is moved to the selected device, the processor stays on the host.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Florence-2-base-ft",
    torch_dtype=torch_dtype,
    trust_remote_code=True,
).to(device)
processor = AutoProcessor.from_pretrained(
    "microsoft/Florence-2-base-ft",
    trust_remote_code=True,
)
def edit_image_with_openai(image_path, mask_path, prompt, output_path=None):
    """
    Call OpenAI Images Edit API to inpaint transparent areas of mask.

    The image and mask files are uploaded to the "gpt-image-1" model, the
    first returned result is base64-decoded and the bytes are written to
    ``output_path``.

    Args:
        image_path: Path of the image to edit.
        mask_path: Path of the mask image whose transparent areas mark the
            regions to inpaint.
        prompt: Text prompt describing what to paint into the masked areas.
        output_path: Destination file for the edited image. When omitted,
            the module-level IMAGE_NAME_FOR_OUTPUT_ARTIFACT is used so the
            output name follows the configured input image instead of a
            fixed file name.

    Returns:
        The path the edited image was written to.
    """
    # Imported here so the OpenAI SDK is only needed when an edit is requested.
    import base64

    from openai import OpenAI

    if output_path is None:
        output_path = IMAGE_NAME_FOR_OUTPUT_ARTIFACT

    client = OpenAI()
    # Both files must stay open while the request is being sent.
    with open(image_path, "rb") as img_file, open(mask_path, "rb") as mask_file:
        result = client.images.edit(
            model="gpt-image-1",
            image=img_file,
            mask=mask_file,
            prompt=prompt,
        )

    # The response carries the image as base64 text; decode it to raw bytes.
    image_bytes = base64.b64decode(result.data[0].b64_json)
    with open(output_path, "wb") as out_file:
        out_file.write(image_bytes)
    return output_path
def run_example(task_prompt, image_name, text_input=None):
    """
    Run one task of the loaded model on an image file and return the result.

    Args:
        task_prompt: Task token string; also passed as ``task`` to the
            processor's post-processing step.
        image_name: Path of the image to open.
        text_input: Optional text appended directly to ``task_prompt`` to
            form the full prompt.

    Returns:
        The value produced by ``processor.post_process_generation`` for the
        first generated sequence.
    """
    image = Image.open(image_name)

    # The task token alone is the prompt unless extra text is supplied.
    prompt = task_prompt if text_input is None else task_prompt + text_input

    model_inputs = processor(text=prompt, images=image, return_tensors="pt")
    model_inputs = model_inputs.to(device, torch_dtype)

    generated_ids = model.generate(
        input_ids=model_inputs["input_ids"],
        pixel_values=model_inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3,
    )

    # Special tokens are kept in the decoded text that is handed to the
    # post-processing step.
    decoded_batch = processor.batch_decode(generated_ids, skip_special_tokens=False)
    return processor.post_process_generation(
        decoded_batch[0],
        task=task_prompt,
        image_size=(image.width, image.height),
    )
def resize_and_save_image(image_name, size, output_path):
    """
    Write a resized RGB copy of an image to ``output_path``.

    Args:
        image_name: Path of the source image.
        size: Target size passed straight to ``Image.resize``.
        output_path: Destination path for the resized image.
    """
    source = Image.open(image_name)
    # Resize first, then convert to RGB before saving.
    resized = source.resize(size)
    rgb_copy = resized.convert("RGB")
    rgb_copy.save(output_path)
# Script entry point: detect -> build mask -> inpaint
if __name__ == "__main__":
    # Phrase grounding task; the phrase to locate is PROMPT_FOR_OBJECTS_TO_DETECT
    task_prompt = "<CAPTION_TO_PHRASE_GROUNDING>"

    # Detection runs on the resized copy so the boxes match the image that is
    # later sent for editing.
    resize_and_save_image(INPUT_IMAGE_NAME, INPUT_IMAGE_SIZE, IMAGE_NAME_FOR_INPUT_ARTIFACT)
    results = run_example(task_prompt, IMAGE_NAME_FOR_INPUT_ARTIFACT, PROMPT_FOR_OBJECTS_TO_DETECT)

    # Build the mask: start fully opaque, then make the inside of every
    # detected box with the requested label fully transparent.
    image = Image.open(IMAGE_NAME_FOR_INPUT_ARTIFACT)
    detections = results[task_prompt]
    wanted_label = PROMPT_FOR_OBJECTS_TO_DETECT.lower()

    mask = Image.new("RGBA", image.size, (0, 0, 0, 255))
    draw = ImageDraw.Draw(mask)
    for bbox, label in zip(detections["bboxes"], detections["labels"]):
        if label.lower() != wanted_label:
            continue
        left, top, right, bottom = bbox
        draw.rectangle([left, top, right, bottom], fill=(0, 0, 0, 0))
    mask.save(IMAGE_NAME_FOR_MASK_ARTIFACT)

    # Call the OpenAI Images Edit API to inpaint the transparent mask regions
    edit_image_with_openai(
        image_path=IMAGE_NAME_FOR_INPUT_ARTIFACT,
        mask_path=IMAGE_NAME_FOR_MASK_ARTIFACT,
        prompt=PROMPT_FOR_EDITING,
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment