Skip to content

Instantly share code, notes, and snippets.

@fuhoi
Created October 19, 2025 12:03
Show Gist options
  • Save fuhoi/a1317d1262099274f18ab59313e29d97 to your computer and use it in GitHub Desktop.
Save fuhoi/a1317d1262099274f18ab59313e29d97 to your computer and use it in GitHub Desktop.
# %%
# Notebook-only cell: `%pip` is an IPython magic, not plain Python syntax.
# Installs/upgrades the third-party packages imported by the later cells.
%pip install --upgrade PyMuPDF  # imported below as `pymupdf`
%pip install --upgrade google-genai  # imported below as `google.genai`
%pip install --upgrade Pillow  # imported below as `PIL`
%pip install --upgrade pydantic  # provides `BaseModel` for the response schema
# %%
# Notebook configuration shared by the cells below.
import os

# Folder that receives the rendered page PNGs and the model output files.
PDF_DIR = "pdf-2"
# PDF document that is rendered page by page.
PDF_SRC = "Cart Receipt 165082.pdf"
# SECURITY: never commit API keys. The key is read from the environment; it is
# None when the GOOGLE_API_KEY variable is unset.
# NOTE(review): a literal key was previously stored on this line and published
# with the file -- treat that key as compromised and revoke it.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
# Model name passed to generate_content.
MODEL_ID = "gemini-2.5-flash"
# %%
from pathlib import Path
# Make sure the output folder exists before any page image is written into it;
# missing parents are created and an already-existing folder is not an error.
directory_path = Path(PDF_DIR)
directory_path.mkdir(
    exist_ok=True,
    parents=True,
)
# %%
import sys, pymupdf # import the bindings
# Render every page of the source PDF to a 300 dpi PNG inside PDF_DIR.
fname = PDF_SRC  # could be taken from sys.argv[1] when run as a script
# Only the file-name part goes into the output name, so a PDF_SRC that
# contains directory components still lands directly inside PDF_DIR.
pdf_name = Path(fname).name
# The context manager closes the document even if rendering a page fails.
with pymupdf.open(fname) as doc:
    for page in doc:
        pix = page.get_pixmap(dpi=300)  # rasterize the page
        # page.number is the page index reported by PyMuPDF (0-based per its docs).
        pdf_file_path = f"{PDF_DIR}/{pdf_name}__page-{page.number}.png"
        pix.save(pdf_file_path)  # format is chosen from the .png extension
# %%
# Sanity check: print the path of every PNG currently present in PDF_DIR.
rendered_pages = Path(PDF_DIR).glob("*.png")
for file in rendered_pages:
    print(file)
# %%
from google import genai
from google.genai import types
# Shared Gemini API client, authenticated with GOOGLE_API_KEY; call_gen_content
# below issues its generate_content requests through this object.
client = genai.Client(api_key=GOOGLE_API_KEY)
# %%
from pydantic import BaseModel
# Schema for one detected region. call_gen_content passes list[BoundingBox] as
# the response_schema, so the model's JSON reply is an array of these objects.
# NOTE(review): the docstring below may be forwarded to the API as the schema
# description -- confirm before rewording it.
class BoundingBox(BaseModel):
    """
    Represents a bounding box with its 2D coordinates and associated label.
    Attributes:
    box_2d (list[int]): A list of integers representing the 2D coordinates of the bounding box,
    typically in the format [y_min, x_min, y_max, x_max].
    label (str): A string representing the label or class associated with the object within the bounding box.
    """
    # plot_bounding_boxes divides each value by 1000 before scaling to pixels,
    # i.e. it treats the coordinates as normalized to a 0-1000 range.
    box_2d: list[int]
    # Text drawn next to the box by plot_bounding_boxes (skipped when empty).
    label: str
# %%
def call_gen_content(file_path):
    """Send one rendered page image to the model and save its JSON reply.

    Reads the PNG at ``file_path``, asks ``MODEL_ID`` (through the shared
    ``client``) for labelled bounding boxes constrained to the
    ``list[BoundingBox]`` JSON schema, prints the reply and writes it next to
    the image as ``<file_path>__model_output.txt``.

    Args:
        file_path: Path of the PNG image to analyse, as a string.

    Raises:
        ValueError: If the response contains no text. Nothing is written in
            that case, so no empty output file is left behind to break the
            later ``json.loads`` step.
    """
    with open(file_path, 'rb') as f:
        image_bytes = f.read()

    response = client.models.generate_content(
        model=MODEL_ID,
        contents=[
            types.Part.from_bytes(
                data=image_bytes,
                mime_type='image/png',
            ),
            'Extract all text boxes from the PDF and return their bounding boxes in JSON format.'
        ],
        config=types.GenerateContentConfig(
            system_instruction="""
Return bounding boxes as an array with labels.
Never return masks. Limit to 200 objects.
If an object is present multiple times, give each object a unique label
according to its distinct characteristics (colors, size, position, etc..).
""",
            temperature=0.5,
            safety_settings=[
                types.SafetySetting(
                    category=types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
                    threshold=types.HarmBlockThreshold.BLOCK_ONLY_HIGH,
                ),
            ],
            # Force a JSON reply that matches the BoundingBox schema.
            response_mime_type="application/json",
            response_schema=list[BoundingBox],
        )
    )

    # response.text is None when the reply has no text part (for example a
    # blocked or empty candidate); writing None would raise TypeError only
    # after the output file had already been created empty.
    response_text = response.text
    if response_text is None:
        raise ValueError(f"Model returned no text for {file_path}")

    print(response_text)
    with open(f"{file_path}__model_output.txt", "w") as f:
        f.write(response_text)
# %%
# Run the model once per rendered page image; each call writes a
# "<image path>__model_output.txt" file next to the PNG it was given.
for file in Path(PDF_DIR).glob("*.png"):
    image_file_path = str(file)
    print(image_file_path)
    call_gen_content(image_file_path)
# %%
from PIL import Image, ImageColor, ImageDraw
def plot_bounding_boxes(image_uri: str, bounding_boxes: list[BoundingBox]) -> None:
    """Draw labelled bounding boxes on an image and display it.

    Args:
        image_uri: Path of the image file to open.
        bounding_boxes: Boxes to draw. Each item is either a BoundingBox
            instance or a plain dict with the same ``box_2d`` / ``label``
            keys (the shape produced by ``json.loads`` on the model output).
            ``box_2d`` is ``[y_min, x_min, y_max, x_max]`` normalized to the
            0-1000 range.

    Raises:
        IndexError: If a ``box_2d`` has fewer than four values.
    """
    with Image.open(image_uri) as im:
        width, height = im.size
        draw = ImageDraw.Draw(im)
        colors = list(ImageColor.colormap.keys())

        for i, bbox in enumerate(bounding_boxes):
            # Parsed JSON yields dicts, while the signature promises
            # BoundingBox objects (not subscriptable) -- support both.
            if isinstance(bbox, dict):
                box_2d, label = bbox['box_2d'], bbox['label']
            else:
                box_2d, label = bbox.box_2d, bbox.label

            # Scale normalized coordinates to image dimensions
            y_a = int(box_2d[0] / 1000 * height)
            x_a = int(box_2d[1] / 1000 * width)
            y_b = int(box_2d[2] / 1000 * height)
            x_b = int(box_2d[3] / 1000 * width)

            # ImageDraw.rectangle rejects boxes whose second corner lies
            # before the first, so order the corners in case the model
            # returned min/max swapped.
            abs_x_min, abs_x_max = min(x_a, x_b), max(x_a, x_b)
            abs_y_min, abs_y_max = min(y_a, y_b), max(y_a, y_b)

            # Cycle through the named colors so neighbouring boxes differ.
            color = colors[i % len(colors)]

            # Draw the rectangle using the correct (x, y) pairs
            draw.rectangle(
                ((abs_x_min, abs_y_min), (abs_x_max, abs_y_max)),
                outline=color,
                width=4,
            )
            if label:
                # Position the text at the top-left corner of the box
                draw.text((abs_x_min + 8, abs_y_min + 6), label, fill=color)

        im.show()
# %%
import json
# For every saved model reply, recover the image it belongs to (the output
# file name is the image path plus a "__model_output.txt" suffix), parse the
# JSON array of boxes and draw them on that image.
for file in Path(PDF_DIR).glob("*__model_output.txt"):
    print(file)
    img_file_path = str(file).replace("__model_output.txt", "")
    with open(file) as f:
        text_content = f.read()
    json_object = json.loads(text_content)
    plot_bounding_boxes(img_file_path, json_object)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment