Norod · April 29, 2025 14:25
diff --git a/test-florence2-gpt-image-1.py b/test-florence2-gpt-image-1.py
 import torch

 from PIL import Image
 from PIL import ImageDraw  # Added for mask creation
 from transformers import AutoProcessor, AutoModelForCausalLM 

 #################################################
 # Candidate (width, height) sizes in pixels. One of them is selected as
 # INPUT_IMAGE_SIZE below and is what the input image gets resized to.
 # NOTE(review): presumably these match the sizes the image edit API accepts - confirm.
 INPUT_IMAGE_SIZE_PORTRAIT = (1024, 1536)
 INPUT_IMAGE_SIZE_LANDSCAPE = (1536, 1024)
 INPUT_IMAGE_SIZE_SQUARE = (1024, 1024)
 #################################################

 INPUT_IMAGE_NAME = "./IMG_0330.jpg"  # Path to your image
 INPUT_IMAGE_SIZE = INPUT_IMAGE_SIZE_PORTRAIT  # Set the desired input image size

 PROMPT_FOR_OBJECTS_TO_DETECT = "picture frame"  # Phrase to locate in the image
 PROMPT_FOR_EDITING = "famous painting"  # Prompt sent with the image edit request

 #################################################


 def artifact_stem(image_path):
     """
     Return the base name used for the files this script generates.

     The directory part (everything up to the last "/") and only the final
     extension are removed, then spaces are replaced with underscores.
     Stripping only the last extension keeps names such as "a.v1.jpg" and
     "a.v2.jpg" distinct; cutting at the first dot would map both to "a".
     A name without a usable stem (no dot, or only a leading dot) is kept whole.
     """
     file_name = image_path.split("/")[-1]
     stem, _dot, _extension = file_name.rpartition(".")
     return (stem or file_name).replace(" ", "_")


 # Every generated file shares the input image's stem plus a role suffix.
 IMAGE_NAME_FOR_ARTIFACTS = artifact_stem(INPUT_IMAGE_NAME)
 IMAGE_NAME_FOR_INPUT_ARTIFACT = IMAGE_NAME_FOR_ARTIFACTS + "_input.jpg"
 IMAGE_NAME_FOR_MASK_ARTIFACT = IMAGE_NAME_FOR_ARTIFACTS + "_mask.png"
 # NOTE(review): the output artifact name is not referenced elsewhere in the visible script.
 IMAGE_NAME_FOR_OUTPUT_ARTIFACT = IMAGE_NAME_FOR_ARTIFACTS + "_output.png"

 # Pick the compute device in order of preference: CUDA, then MPS, then CPU.
 if torch.cuda.is_available():
     device = "cuda:0"
 elif torch.backends.mps.is_available():
     device = "mps"
 else:
     device = "cpu"

 # Both accelerator choices use half precision; only the CPU fallback uses float32.
 torch_dtype = torch.float32 if device == "cpu" else torch.float16

 print(f"Using device: {device} with dtype: {torch_dtype}")

 # Load the processor and the model from the same Florence-2 checkpoint.
 # The model is created with the dtype chosen above and then moved to the selected device.
 processor = AutoProcessor.from_pretrained(
     "microsoft/Florence-2-base-ft",
     trust_remote_code=True,
 )
 model = AutoModelForCausalLM.from_pretrained(
     "microsoft/Florence-2-base-ft",
     torch_dtype=torch_dtype,
     trust_remote_code=True,
 )
 model = model.to(device)

 def edit_image_with_openai(image_path, mask_path, prompt, output_path="IMG_0330_edited.png"):
     """
     Call OpenAI Images Edit API to inpaint transparent areas of mask.

     The image and mask files are sent with ``prompt`` to the "gpt-image-1"
     model; the first returned image is base64-decoded and written to
     ``output_path``.

     Args:
         image_path: Path of the image file to edit.
         mask_path: Path of the mask file sent alongside the image.
         prompt: Text prompt passed to the edit request.
         output_path: File the decoded result is written to. The default is
             the name that used to be hard-coded here, so existing callers
             keep writing to the same file.

     Returns:
         ``output_path``, for the caller's convenience.

     Raises:
         RuntimeError: If the response carries no base64 image data.
     """
     # Imported lazily so the openai package is only required when an edit is requested.
     import base64

     from openai import OpenAI

     # No explicit credentials are passed; presumably the client reads them
     # from its default configuration (e.g. environment) - confirm.
     client = OpenAI()
     with open(image_path, "rb") as img_file, open(mask_path, "rb") as mask_file:
         result = client.images.edit(
             model="gpt-image-1",
             image=img_file,
             mask=mask_file,
             prompt=prompt,
         )

     # Fail with a clear message instead of an IndexError/TypeError further down.
     image_base64 = result.data[0].b64_json if result.data else None
     if not image_base64:
         raise RuntimeError("OpenAI image edit returned no base64 image data")

     with open(output_path, "wb") as out_file:
         out_file.write(base64.b64decode(image_base64))
     return output_path

 def run_example(task_prompt, image_name, text_input=None):
     """
     Run one Florence-2 task on an image file and return the parsed result.

     Args:
         task_prompt: Task token for the model, e.g. "<CAPTION_TO_PHRASE_GROUNDING>".
         image_name: Path of the image file to analyse.
         text_input: Optional extra text; when given it is appended directly
             to ``task_prompt`` (no separator) to form the model prompt.

     Returns:
         Whatever ``processor.post_process_generation`` produces for the
         task. The caller in this script indexes it by ``task_prompt`` and
         reads the "bboxes" and "labels" entries.
     """
     prompt = task_prompt if text_input is None else task_prompt + text_input

     # Open the image in a context manager so its file handle is released as
     # soon as the processor has turned it into tensors.
     with Image.open(image_name) as image:
         image_size = (image.width, image.height)
         inputs = processor(text=prompt, images=image, return_tensors="pt").to(device, torch_dtype)

     generated_ids = model.generate(
         input_ids=inputs["input_ids"],
         pixel_values=inputs["pixel_values"],
         max_new_tokens=1024,
         num_beams=3,
     )
     # Special tokens are kept in the decoded text; presumably the
     # post-processing step needs them to recover coordinates - confirm.
     generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
     return processor.post_process_generation(generated_text, task=task_prompt, image_size=image_size)

 def resize_and_save_image(image_name, size, output_path):
     """
     Resize and save the image to the specified path.

     Args:
         image_name: Path of the source image file.
         size: Target size passed straight to ``Image.resize``; the image is
             resized to exactly this size (no aspect-ratio handling is done here).
         output_path: Destination file; the image is converted to RGB before saving.
     """
     # The context manager closes the source file once the resized copy exists.
     with Image.open(image_name) as image:
         resized_image = image.resize(size).convert("RGB")
     resized_image.save(output_path)
    

 # declare main
 if __name__ == "__main__":
     # Run phrase grounding for the configured object prompt on the resized input image.
     task_prompt = "<CAPTION_TO_PHRASE_GROUNDING>"

     resize_and_save_image(INPUT_IMAGE_NAME, INPUT_IMAGE_SIZE, IMAGE_NAME_FOR_INPUT_ARTIFACT)
     results = run_example(task_prompt, IMAGE_NAME_FOR_INPUT_ARTIFACT, PROMPT_FOR_OBJECTS_TO_DETECT)
     data = results[task_prompt]

     # Keep only the boxes whose label matches the requested phrase (case-insensitive).
     wanted_label = PROMPT_FOR_OBJECTS_TO_DETECT.lower()
     target_boxes = [
         bbox
         for bbox, label in zip(data["bboxes"], data["labels"])
         if label.lower() == wanted_label
     ]

     # Build mask: opaque everywhere, transparent inside every matching box.
     # The input artifact is opened only to read its size, then closed again.
     with Image.open(IMAGE_NAME_FOR_INPUT_ARTIFACT) as image:
         mask_size = image.size
     mask = Image.new("RGBA", mask_size, (0, 0, 0, 255))
     draw = ImageDraw.Draw(mask)
     for x1, y1, x2, y2 in target_boxes:
         draw.rectangle([x1, y1, x2, y2], fill=(0, 0, 0, 0))
     mask.save(IMAGE_NAME_FOR_MASK_ARTIFACT)

     if target_boxes:
         # Call OpenAI Images Edit API to inpaint the detected regions
         edit_image_with_openai(
             image_path=IMAGE_NAME_FOR_INPUT_ARTIFACT,
             mask_path=IMAGE_NAME_FOR_MASK_ARTIFACT,
             prompt=PROMPT_FOR_EDITING,
         )
     else:
         # A fully opaque mask leaves nothing to inpaint, so skip the API request.
         print(f"No '{PROMPT_FOR_OBJECTS_TO_DETECT}' boxes detected; skipping the image edit request.")
	import torch

	from PIL import Image
	from PIL import ImageDraw # Added for mask creation
	from transformers import AutoProcessor, AutoModelForCausalLM

	#################################################
	# Candidate (width, height) sizes in pixels. One of them is selected as
	# INPUT_IMAGE_SIZE below and is what the input image gets resized to.
	# NOTE(review): presumably these match the sizes the image edit API accepts - confirm.
	INPUT_IMAGE_SIZE_PORTRAIT = (1024, 1536)
	INPUT_IMAGE_SIZE_LANDSCAPE = (1536, 1024)
	INPUT_IMAGE_SIZE_SQUARE = (1024, 1024)
	#################################################

	INPUT_IMAGE_NAME = "./IMG_0330.jpg" # Path to your image
	INPUT_IMAGE_SIZE = INPUT_IMAGE_SIZE_PORTRAIT # Set the desired input image size

	PROMPT_FOR_OBJECTS_TO_DETECT = "picture frame" # Phrase to locate in the image
	PROMPT_FOR_EDITING = "famous painting" # Prompt sent with the image edit request

	#################################################


	def artifact_stem(image_path):
	    """
	    Return the base name used for the files this script generates.

	    The directory part (everything up to the last "/") and only the final
	    extension are removed, then spaces are replaced with underscores.
	    Stripping only the last extension keeps names such as "a.v1.jpg" and
	    "a.v2.jpg" distinct; cutting at the first dot would map both to "a".
	    A name without a usable stem (no dot, or only a leading dot) is kept whole.
	    """
	    file_name = image_path.split("/")[-1]
	    stem, _dot, _extension = file_name.rpartition(".")
	    return (stem or file_name).replace(" ", "_")


	# Every generated file shares the input image's stem plus a role suffix.
	IMAGE_NAME_FOR_ARTIFACTS = artifact_stem(INPUT_IMAGE_NAME)
	IMAGE_NAME_FOR_INPUT_ARTIFACT = IMAGE_NAME_FOR_ARTIFACTS + "_input.jpg"
	IMAGE_NAME_FOR_MASK_ARTIFACT = IMAGE_NAME_FOR_ARTIFACTS + "_mask.png"
	# NOTE(review): the output artifact name is not referenced elsewhere in the visible script.
	IMAGE_NAME_FOR_OUTPUT_ARTIFACT = IMAGE_NAME_FOR_ARTIFACTS + "_output.png"

	# Pick the compute device in order of preference: CUDA, then MPS, then CPU.
	if torch.cuda.is_available():
	    device = "cuda:0"
	elif torch.backends.mps.is_available():
	    device = "mps"
	else:
	    device = "cpu"

	# Both accelerator choices use half precision; only the CPU fallback uses float32.
	torch_dtype = torch.float32 if device == "cpu" else torch.float16

	print(f"Using device: {device} with dtype: {torch_dtype}")

	# Load the processor and the model from the same Florence-2 checkpoint.
	# The model is created with the dtype chosen above and then moved to the selected device.
	processor = AutoProcessor.from_pretrained(
	    "microsoft/Florence-2-base-ft",
	    trust_remote_code=True,
	)
	model = AutoModelForCausalLM.from_pretrained(
	    "microsoft/Florence-2-base-ft",
	    torch_dtype=torch_dtype,
	    trust_remote_code=True,
	)
	model = model.to(device)

	def edit_image_with_openai(image_path, mask_path, prompt, output_path="IMG_0330_edited.png"):
	    """
	    Call OpenAI Images Edit API to inpaint transparent areas of mask.

	    The image and mask files are sent with ``prompt`` to the "gpt-image-1"
	    model; the first returned image is base64-decoded and written to
	    ``output_path``.

	    Args:
	        image_path: Path of the image file to edit.
	        mask_path: Path of the mask file sent alongside the image.
	        prompt: Text prompt passed to the edit request.
	        output_path: File the decoded result is written to. The default is
	            the name that used to be hard-coded here, so existing callers
	            keep writing to the same file.

	    Returns:
	        ``output_path``, for the caller's convenience.

	    Raises:
	        RuntimeError: If the response carries no base64 image data.
	    """
	    # Imported lazily so the openai package is only required when an edit is requested.
	    import base64

	    from openai import OpenAI

	    # No explicit credentials are passed; presumably the client reads them
	    # from its default configuration (e.g. environment) - confirm.
	    client = OpenAI()
	    with open(image_path, "rb") as img_file, open(mask_path, "rb") as mask_file:
	        result = client.images.edit(
	            model="gpt-image-1",
	            image=img_file,
	            mask=mask_file,
	            prompt=prompt,
	        )

	    # Fail with a clear message instead of an IndexError/TypeError further down.
	    image_base64 = result.data[0].b64_json if result.data else None
	    if not image_base64:
	        raise RuntimeError("OpenAI image edit returned no base64 image data")

	    with open(output_path, "wb") as out_file:
	        out_file.write(base64.b64decode(image_base64))
	    return output_path

	def run_example(task_prompt, image_name, text_input=None):
	    """
	    Run one Florence-2 task on an image file and return the parsed result.

	    Args:
	        task_prompt: Task token for the model, e.g. "<CAPTION_TO_PHRASE_GROUNDING>".
	        image_name: Path of the image file to analyse.
	        text_input: Optional extra text; when given it is appended directly
	            to ``task_prompt`` (no separator) to form the model prompt.

	    Returns:
	        Whatever ``processor.post_process_generation`` produces for the
	        task. The caller in this script indexes it by ``task_prompt`` and
	        reads the "bboxes" and "labels" entries.
	    """
	    prompt = task_prompt if text_input is None else task_prompt + text_input

	    # Open the image in a context manager so its file handle is released as
	    # soon as the processor has turned it into tensors.
	    with Image.open(image_name) as image:
	        image_size = (image.width, image.height)
	        inputs = processor(text=prompt, images=image, return_tensors="pt").to(device, torch_dtype)

	    generated_ids = model.generate(
	        input_ids=inputs["input_ids"],
	        pixel_values=inputs["pixel_values"],
	        max_new_tokens=1024,
	        num_beams=3,
	    )
	    # Special tokens are kept in the decoded text; presumably the
	    # post-processing step needs them to recover coordinates - confirm.
	    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
	    return processor.post_process_generation(generated_text, task=task_prompt, image_size=image_size)

	def resize_and_save_image(image_name, size, output_path):
	    """
	    Resize and save the image to the specified path.

	    Args:
	        image_name: Path of the source image file.
	        size: Target size passed straight to ``Image.resize``; the image is
	            resized to exactly this size (no aspect-ratio handling is done here).
	        output_path: Destination file; the image is converted to RGB before saving.
	    """
	    # The context manager closes the source file once the resized copy exists.
	    with Image.open(image_name) as image:
	        resized_image = image.resize(size).convert("RGB")
	    resized_image.save(output_path)


	# declare main
	if __name__ == "__main__":
	    # Run phrase grounding for the configured object prompt on the resized input image.
	    task_prompt = "<CAPTION_TO_PHRASE_GROUNDING>"

	    resize_and_save_image(INPUT_IMAGE_NAME, INPUT_IMAGE_SIZE, IMAGE_NAME_FOR_INPUT_ARTIFACT)
	    results = run_example(task_prompt, IMAGE_NAME_FOR_INPUT_ARTIFACT, PROMPT_FOR_OBJECTS_TO_DETECT)
	    data = results[task_prompt]

	    # Keep only the boxes whose label matches the requested phrase (case-insensitive).
	    wanted_label = PROMPT_FOR_OBJECTS_TO_DETECT.lower()
	    target_boxes = [
	        bbox
	        for bbox, label in zip(data["bboxes"], data["labels"])
	        if label.lower() == wanted_label
	    ]

	    # Build mask: opaque everywhere, transparent inside every matching box.
	    # The input artifact is opened only to read its size, then closed again.
	    with Image.open(IMAGE_NAME_FOR_INPUT_ARTIFACT) as image:
	        mask_size = image.size
	    mask = Image.new("RGBA", mask_size, (0, 0, 0, 255))
	    draw = ImageDraw.Draw(mask)
	    for x1, y1, x2, y2 in target_boxes:
	        draw.rectangle([x1, y1, x2, y2], fill=(0, 0, 0, 0))
	    mask.save(IMAGE_NAME_FOR_MASK_ARTIFACT)

	    if target_boxes:
	        # Call OpenAI Images Edit API to inpaint the detected regions
	        edit_image_with_openai(
	            image_path=IMAGE_NAME_FOR_INPUT_ARTIFACT,
	            mask_path=IMAGE_NAME_FOR_MASK_ARTIFACT,
	            prompt=PROMPT_FOR_EDITING,
	        )
	    else:
	        # A fully opaque mask leaves nothing to inpaint, so skip the API request.
	        print(f"No '{PROMPT_FOR_OBJECTS_TO_DETECT}' boxes detected; skipping the image edit request.")