fuhoi · October 19, 2025 12:03
diff --git a/pdf_to_png_to_genai.py b/pdf_to_png_to_genai.py
 # %%
 # IPython/Jupyter line magics: install (or upgrade) the third-party packages
 # imported by the cells below. These lines only work inside an IPython
 # kernel; they are a syntax error in a plain `python` run.
 %pip install --upgrade PyMuPDF
 %pip install --upgrade google-genai
 %pip install --upgrade Pillow
 %pip install --upgrade pydantic

 # %%
 import os

 # Directory that receives the rendered page images and the model outputs.
 PDF_DIR = "pdf-2"
 # PDF document to render; opened relative to the current working directory.
 PDF_SRC = "Cart Receipt 165082.pdf"
 # SECURITY: never commit an API key to source control. The key is read from
 # the GOOGLE_API_KEY environment variable instead (None when it is unset).
 # The literal key that used to be stored here has been published and should
 # be revoked.
 GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
 # Model name passed to the API when generating content.
 MODEL_ID = "gemini-2.5-flash"

 # %%
 from pathlib import Path
 # Make sure the output directory exists. parents=True creates any missing
 # ancestors and exist_ok=True makes re-running this cell harmless.
 directory_path = Path(PDF_DIR)
 directory_path.mkdir(parents=True, exist_ok=True)

 # %%
 import sys, pymupdf  # import the bindings
 fname = PDF_SRC  # could be sys.argv[1] to take the filename from the command line
 # Open the document in a context manager so it is closed even when rendering
 # or saving a page fails.
 with pymupdf.open(fname) as doc:
    for page in doc:  # iterate through the pages
        pix = page.get_pixmap(dpi=300)  # render page to an image at 300 dpi
        # One PNG per page, named after the source PDF and the page number.
        pdf_file_path = f"{PDF_DIR}/{PDF_SRC}__page-{page.number}.png"
        pix.save(pdf_file_path)  # store image as a PNG

 # %%
 # List the rendered pages. Path.glob makes no ordering guarantee, so sort the
 # matches to get the same listing on every run.
 for file in sorted(Path(PDF_DIR).glob("*.png")):
    print(file)

 # %%
 from google import genai
 from google.genai import types

 # Shared API client; call_gen_content uses it for every request.
 client = genai.Client(api_key=GOOGLE_API_KEY)

 # %%
 from pydantic import BaseModel

 # Schema for one detected region. call_gen_content passes
 # list[BoundingBox] as the response_schema of the request.
 # NOTE(review): pydantic includes the class docstring in the generated JSON
 # schema, so its text may be forwarded to the model - confirm before
 # rewording it.
 class BoundingBox(BaseModel):
    """
    Represents a bounding box with its 2D coordinates and associated label.

    Attributes:
        box_2d (list[int]): A list of integers representing the 2D coordinates of the bounding box,
                            typically in the format [y_min, x_min, y_max, x_max].
        label (str): A string representing the label or class associated with the object within the bounding box.
    """

    # plot_bounding_boxes reads the first four values as
    # [y_min, x_min, y_max, x_max] and divides them by 1000 before scaling.
    box_2d: list[int]
    # Text drawn next to the box; an empty label is not drawn.
    label: str

 # %%
 def call_gen_content(file_path):
    """
    Send one page image to the model and save the JSON text it returns.

    The file is uploaded as PNG bytes together with a prompt asking for the
    bounding boxes of all text boxes. The response is requested as JSON
    matching ``list[BoundingBox]``; it is printed and then written to
    ``<file_path>__model_output.txt``.

    Args:
        file_path: Path of the PNG file to analyse.

    Raises:
        ValueError: If the response carries no text (e.g. no candidates were
            returned). Nothing is written in that case, so no empty output
            file is left behind for later steps to trip over.
    """
    with open(file_path, 'rb') as f:
        image_bytes = f.read()

    response = client.models.generate_content(
        model=MODEL_ID,
        contents=[
            types.Part.from_bytes(
                data=image_bytes,
                mime_type='image/png',
            ),
            'Extract all text boxes from the PDF and return their bounding boxes in JSON format.'
        ],
        config=types.GenerateContentConfig(
            system_instruction="""
                Return bounding boxes as an array with labels.
                Never return masks. Limit to 200 objects.
                If an object is present multiple times, give each object a unique label
                according to its distinct characteristics (colors, size, position, etc..).
                """,
            temperature=0.5,
            safety_settings=[
                types.SafetySetting(
                    category=types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
                    threshold=types.HarmBlockThreshold.BLOCK_ONLY_HIGH,
                ),
            ],
            response_mime_type="application/json",
            response_schema=list[BoundingBox],
        )
    )

    # Check before opening the output file: opening in "w" mode creates or
    # truncates it immediately, and writing None would then raise TypeError.
    result_text = response.text
    if result_text is None:
        raise ValueError(f"Model returned no text for {file_path}; nothing written.")
    print(result_text)

    with open(f"{file_path}__model_output.txt", "w") as f:
        f.write(result_text)

 # %%
 # Run the model on every PNG in the output directory. Path.glob makes no
 # ordering guarantee, so sort the matches to process pages in a stable order.
 for file in sorted(Path(PDF_DIR).glob("*.png")):
    print(file)
    call_gen_content(str(file))

 # %%
 from PIL import Image, ImageColor, ImageDraw

 def plot_bounding_boxes(image_uri: str, bounding_boxes: list[BoundingBox]) -> None:
    """
    Draw labelled bounding boxes on an image and display the result.

    Args:
        image_uri: Path of the image file to open.
        bounding_boxes: The boxes to draw. Each item is either a BoundingBox
            instance or a dict with ``box_2d`` and ``label`` keys (the shape
            ``json.loads`` produces from the saved model output). ``box_2d``
            is ``[y_min, x_min, y_max, x_max]`` on a 0-1000 scale relative
            to the image height and width.
    """
    with Image.open(image_uri) as im:
        width, height = im.size
        draw = ImageDraw.Draw(im)

        # Cycle through PIL's named colours so neighbouring boxes differ.
        colors = list(ImageColor.colormap.keys())

        for i, bbox in enumerate(bounding_boxes):
            # Accept parsed-JSON dicts as well as BoundingBox objects; the
            # original code only worked with dicts despite the annotation.
            if isinstance(bbox, dict):
                box_2d, label = bbox['box_2d'], bbox['label']
            else:
                box_2d, label = bbox.box_2d, bbox.label

            # Scale 0-1000 coordinates to pixel positions
            y_first = int(box_2d[0] / 1000 * height)
            x_first = int(box_2d[1] / 1000 * width)
            y_second = int(box_2d[2] / 1000 * height)
            x_second = int(box_2d[3] / 1000 * width)

            # The model may emit the corners swapped; order them so that
            # rectangle() always receives (left, top) before (right, bottom).
            # NOTE(review): Pillow can reject a box whose second corner
            # precedes the first - confirm against the installed version.
            abs_x_min, abs_x_max = sorted((x_first, x_second))
            abs_y_min, abs_y_max = sorted((y_first, y_second))

            color = colors[i % len(colors)]

            draw.rectangle(
                ((abs_x_min, abs_y_min), (abs_x_max, abs_y_max)),
                outline=color,
                width=4,
            )
            if label:
                # Position the text just inside the top-left corner of the box
                draw.text((abs_x_min + 8, abs_y_min + 6), label, fill=color)

        im.show()

 # %%
 import json

 # Plot every saved model output on top of the image it was generated from.
 # Sorted so pages are shown in a stable order (Path.glob guarantees none).
 for file in sorted(Path(PDF_DIR).glob("*__model_output.txt")):
    print(file)
    with open(str(file), 'r') as f:
        text_content = f.read()
    json_object = json.loads(text_content)
    # Strip only the trailing marker to recover the image path; str.replace
    # would also remove the marker if it appeared elsewhere in the path.
    img_file_path = str(file).removesuffix("__model_output.txt")
    plot_bounding_boxes(img_file_path, json_object)
	# %%
	# IPython/Jupyter line magics: install (or upgrade) the third-party packages
	# imported by the cells below. These lines only work inside an IPython
	# kernel; they are a syntax error in a plain `python` run.
	%pip install --upgrade PyMuPDF
	%pip install --upgrade google-genai
	%pip install --upgrade Pillow
	%pip install --upgrade pydantic

	# %%
	import os

	# Directory that receives the rendered page images and the model outputs.
	PDF_DIR = "pdf-2"
	# PDF document to render; opened relative to the current working directory.
	PDF_SRC = "Cart Receipt 165082.pdf"
	# SECURITY: never commit an API key to source control. The key is read from
	# the GOOGLE_API_KEY environment variable instead (None when it is unset).
	# The literal key that used to be stored here has been published and should
	# be revoked.
	GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
	# Model name passed to the API when generating content.
	MODEL_ID = "gemini-2.5-flash"

	# %%
	from pathlib import Path
	# Make sure the output directory exists. parents=True creates any missing
	# ancestors and exist_ok=True makes re-running this cell harmless.
	directory_path = Path(PDF_DIR)
	directory_path.mkdir(parents=True, exist_ok=True)

	# %%
	import sys, pymupdf # import the bindings
	fname = PDF_SRC  # could be sys.argv[1] to take the filename from the command line
	# Open the document in a context manager so it is closed even when rendering
	# or saving a page fails.
	with pymupdf.open(fname) as doc:
	    for page in doc:  # iterate through the pages
	        pix = page.get_pixmap(dpi=300)  # render page to an image at 300 dpi
	        # One PNG per page, named after the source PDF and the page number.
	        pdf_file_path = f"{PDF_DIR}/{PDF_SRC}__page-{page.number}.png"
	        pix.save(pdf_file_path)  # store image as a PNG

	# %%
	# List the rendered pages. Path.glob makes no ordering guarantee, so sort the
	# matches to get the same listing on every run.
	for file in sorted(Path(PDF_DIR).glob("*.png")):
	    print(file)

	# %%
	from google import genai
	from google.genai import types

	# Shared API client; call_gen_content uses it for every request.
	client = genai.Client(api_key=GOOGLE_API_KEY)

	# %%
	from pydantic import BaseModel

	# Schema for one detected region. call_gen_content passes
	# list[BoundingBox] as the response_schema of the request.
	# NOTE(review): pydantic includes the class docstring in the generated JSON
	# schema, so its text may be forwarded to the model - confirm before
	# rewording it.
	class BoundingBox(BaseModel):
	    """
	    Represents a bounding box with its 2D coordinates and associated label.

	    Attributes:
	        box_2d (list[int]): A list of integers representing the 2D coordinates of the bounding box,
	                            typically in the format [y_min, x_min, y_max, x_max].
	        label (str): A string representing the label or class associated with the object within the bounding box.
	    """

	    # plot_bounding_boxes reads the first four values as
	    # [y_min, x_min, y_max, x_max] and divides them by 1000 before scaling.
	    box_2d: list[int]
	    # Text drawn next to the box; an empty label is not drawn.
	    label: str

	# %%
	def call_gen_content(file_path):
	    """
	    Send one page image to the model and save the JSON text it returns.

	    The file is uploaded as PNG bytes together with a prompt asking for the
	    bounding boxes of all text boxes. The response is requested as JSON
	    matching ``list[BoundingBox]``; it is printed and then written to
	    ``<file_path>__model_output.txt``.

	    Args:
	        file_path: Path of the PNG file to analyse.

	    Raises:
	        ValueError: If the response carries no text (e.g. no candidates were
	            returned). Nothing is written in that case, so no empty output
	            file is left behind for later steps to trip over.
	    """
	    with open(file_path, 'rb') as f:
	        image_bytes = f.read()

	    response = client.models.generate_content(
	        model=MODEL_ID,
	        contents=[
	            types.Part.from_bytes(
	                data=image_bytes,
	                mime_type='image/png',
	            ),
	            'Extract all text boxes from the PDF and return their bounding boxes in JSON format.'
	        ],
	        config=types.GenerateContentConfig(
	            system_instruction="""
	Return bounding boxes as an array with labels.
	Never return masks. Limit to 200 objects.
	If an object is present multiple times, give each object a unique label
	according to its distinct characteristics (colors, size, position, etc..).
	""",
	            temperature=0.5,
	            safety_settings=[
	                types.SafetySetting(
	                    category=types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
	                    threshold=types.HarmBlockThreshold.BLOCK_ONLY_HIGH,
	                ),
	            ],
	            response_mime_type="application/json",
	            response_schema=list[BoundingBox],
	        )
	    )

	    # Check before opening the output file: opening in "w" mode creates or
	    # truncates it immediately, and writing None would then raise TypeError.
	    result_text = response.text
	    if result_text is None:
	        raise ValueError(f"Model returned no text for {file_path}; nothing written.")
	    print(result_text)

	    with open(f"{file_path}__model_output.txt", "w") as f:
	        f.write(result_text)

	# %%
	# Run the model on every PNG in the output directory. Path.glob makes no
	# ordering guarantee, so sort the matches to process pages in a stable order.
	for file in sorted(Path(PDF_DIR).glob("*.png")):
	    print(file)
	    call_gen_content(str(file))

	# %%
	from PIL import Image, ImageColor, ImageDraw

	def plot_bounding_boxes(image_uri: str, bounding_boxes: list[BoundingBox]) -> None:
	    """
	    Draw labelled bounding boxes on an image and display the result.

	    Args:
	        image_uri: Path of the image file to open.
	        bounding_boxes: The boxes to draw. Each item is either a BoundingBox
	            instance or a dict with ``box_2d`` and ``label`` keys (the shape
	            ``json.loads`` produces from the saved model output). ``box_2d``
	            is ``[y_min, x_min, y_max, x_max]`` on a 0-1000 scale relative
	            to the image height and width.
	    """
	    with Image.open(image_uri) as im:
	        width, height = im.size
	        draw = ImageDraw.Draw(im)

	        # Cycle through PIL's named colours so neighbouring boxes differ.
	        colors = list(ImageColor.colormap.keys())

	        for i, bbox in enumerate(bounding_boxes):
	            # Accept parsed-JSON dicts as well as BoundingBox objects; the
	            # original code only worked with dicts despite the annotation.
	            if isinstance(bbox, dict):
	                box_2d, label = bbox['box_2d'], bbox['label']
	            else:
	                box_2d, label = bbox.box_2d, bbox.label

	            # Scale 0-1000 coordinates to pixel positions
	            y_first = int(box_2d[0] / 1000 * height)
	            x_first = int(box_2d[1] / 1000 * width)
	            y_second = int(box_2d[2] / 1000 * height)
	            x_second = int(box_2d[3] / 1000 * width)

	            # The model may emit the corners swapped; order them so that
	            # rectangle() always receives (left, top) before (right, bottom).
	            # NOTE(review): Pillow can reject a box whose second corner
	            # precedes the first - confirm against the installed version.
	            abs_x_min, abs_x_max = sorted((x_first, x_second))
	            abs_y_min, abs_y_max = sorted((y_first, y_second))

	            color = colors[i % len(colors)]

	            draw.rectangle(
	                ((abs_x_min, abs_y_min), (abs_x_max, abs_y_max)),
	                outline=color,
	                width=4,
	            )
	            if label:
	                # Position the text just inside the top-left corner of the box
	                draw.text((abs_x_min + 8, abs_y_min + 6), label, fill=color)

	        im.show()

	# %%
	import json

	# Plot every saved model output on top of the image it was generated from.
	# Sorted so pages are shown in a stable order (Path.glob guarantees none).
	for file in sorted(Path(PDF_DIR).glob("*__model_output.txt")):
	    print(file)
	    with open(str(file), 'r') as f:
	        text_content = f.read()
	    json_object = json.loads(text_content)
	    # Strip only the trailing marker to recover the image path; str.replace
	    # would also remove the marker if it appeared elsewhere in the path.
	    img_file_path = str(file).removesuffix("__model_output.txt")
	    plot_bounding_boxes(img_file_path, json_object)