Created
October 19, 2025 12:03
-
-
Save fuhoi/a1317d1262099274f18ab59313e29d97 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # %% | |
| %pip install --upgrade PyMuPDF | |
| %pip install --upgrade google-genai | |
| %pip install --upgrade Pillow | |
| %pip install --upgrade pydantic | |
| # %% | |
import os

# Directory that receives the rendered page images and the model output files.
PDF_DIR = "pdf-2"
# Source PDF to render; a relative name is resolved against the working directory.
PDF_SRC = "Cart Receipt 165082.pdf"
# Read the Gemini API key from the environment instead of committing it to the
# notebook. The literal key that used to be stored here has been published and
# should be revoked. The value is None when the variable is not set.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
# Gemini model used for the bounding-box extraction requests.
MODEL_ID = "gemini-2.5-flash"
| # %% | |
| from pathlib import Path | |
# Make sure the output directory exists: missing parents are created and an
# already-existing directory is not treated as an error.
directory_path = Path(PDF_DIR)
directory_path.mkdir(exist_ok=True, parents=True)
| # %% | |
| import sys, pymupdf # import the bindings | |
# Render every page of the source PDF to a 300-dpi PNG inside PDF_DIR.
fname = PDF_SRC  # could be taken from sys.argv[1] when run as a script
# Open the document in a context manager so it is closed even when rendering
# or saving a page fails (the handle was previously never released).
with pymupdf.open(fname) as doc:
    for page in doc:
        pix = page.get_pixmap(dpi=300)  # rasterize the page
        # The page number is embedded in the file name so each page gets its own image.
        pdf_file_path = f"{PDF_DIR}/{PDF_SRC}__page-{page.number}.png"
        pix.save(pdf_file_path)  # store the image as a PNG
| # %% | |
# Show which page images are currently present in the output directory.
for png_path in Path(PDF_DIR).glob("*.png"):
    print(png_path)
| # %% | |
| from google import genai | |
| from google.genai import types | |
# Single Gemini API client shared by the request cells below.
client = genai.Client(
    api_key=GOOGLE_API_KEY,
)
| # %% | |
| from pydantic import BaseModel | |
class BoundingBox(BaseModel):
    """
    Represents a bounding box with its 2D coordinates and associated label.
    Attributes:
        box_2d (list[int]): A list of integers representing the 2D coordinates of the bounding box,
            typically in the format [y_min, x_min, y_max, x_max].
        label (str): A string representing the label or class associated with the object within the bounding box.
    """
    # NOTE(review): pydantic exposes a model's docstring as the JSON-schema
    # description, and this class is passed as a response schema, so the
    # wording above is presumably visible to the model -- confirm before
    # rephrasing it.

    # Box coordinates; plot_bounding_boxes in this file divides each value by
    # 1000 before scaling it to the image size.
    box_2d: list[int]
    # Text drawn next to the box when it is non-empty.
    label: str
| # %% | |
def call_gen_content(file_path):
    """Request text bounding boxes for one page image and save the reply.

    The bytes of ``file_path`` are sent to ``MODEL_ID`` as ``image/png``
    together with a ``list[BoundingBox]`` JSON response schema. The reply text
    is printed and written next to the image as
    ``<file_path>__model_output.txt``.

    Args:
        file_path: Path (as a string) of the PNG page image to analyse.

    Returns:
        The JSON text returned by the model.

    Raises:
        RuntimeError: If the response contains no text. Nothing is written in
            that case, so no empty output file is left behind for the cell
            that later parses these files with ``json.loads``.
    """
    with open(file_path, 'rb') as f:
        image_bytes = f.read()

    response = client.models.generate_content(
        model=MODEL_ID,
        contents=[
            types.Part.from_bytes(
                data=image_bytes,
                mime_type='image/png',
            ),
            'Extract all text boxes from the PDF and return their bounding boxes in JSON format.'
        ],
        config=types.GenerateContentConfig(
            system_instruction="""
            Return bounding boxes as an array with labels.
            Never return masks. Limit to 200 objects.
            If an object is present multiple times, give each object a unique label
            according to its distinct characteristics (colors, size, position, etc..).
            """,
            temperature=0.5,
            safety_settings=[
                types.SafetySetting(
                    category=types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
                    threshold=types.HarmBlockThreshold.BLOCK_ONLY_HIGH,
                ),
            ],
            response_mime_type="application/json",
            response_schema=list[BoundingBox],
        )
    )

    # Validate before opening the output file: opening in "w" mode creates or
    # truncates it, so failing afterwards would leave an unparsable empty file.
    model_output = response.text
    if not model_output:
        raise RuntimeError(f"Model returned no text for {file_path}")

    print(model_output)
    # NOTE(review): written with the platform default encoding, the same way
    # the overlay cell reads it back; switch both to UTF-8 together if needed.
    with open(f"{file_path}__model_output.txt", "w") as f:
        f.write(model_output)
    return model_output
| # %% | |
# Run the bounding-box extraction once for every rendered page image.
for png_path in Path(PDF_DIR).glob("*.png"):
    print(png_path)
    call_gen_content(str(png_path))
| # %% | |
| from PIL import Image, ImageColor, ImageDraw | |
def _bbox_field(bbox, name):
    """Read ``name`` from a box given either as a dict or as an object."""
    # Parsed JSON yields plain dicts, while the public annotation promises
    # BoundingBox instances; both forms are supported.
    if isinstance(bbox, dict):
        return bbox[name]
    return getattr(bbox, name)


def plot_bounding_boxes(image_uri: str, bounding_boxes: list[BoundingBox]) -> None:
    """Draw labelled bounding boxes on an image and display the result.

    Args:
        image_uri: Path of the image file to open.
        bounding_boxes: Boxes to draw. Each item is either a ``BoundingBox``
            instance or a dict with ``box_2d`` and ``label`` keys (the shape
            produced by ``json.loads`` on the saved model output). ``box_2d``
            holds ``[y_min, x_min, y_max, x_max]`` on a 0-1000 scale, which is
            mapped onto the pixel size of the image.
    """
    with Image.open(image_uri) as im:
        width, height = im.size
        draw = ImageDraw.Draw(im)
        colors = list(ImageColor.colormap)

        for i, bbox in enumerate(bounding_boxes):
            box_2d = _bbox_field(bbox, 'box_2d')
            label = _bbox_field(bbox, 'label')

            # Scale the 0-1000 coordinates to image dimensions.
            abs_y_min = int(box_2d[0] / 1000 * height)
            abs_x_min = int(box_2d[1] / 1000 * width)
            abs_y_max = int(box_2d[2] / 1000 * height)
            abs_x_max = int(box_2d[3] / 1000 * width)

            # The model can return a box with swapped corners; recent Pillow
            # releases reject such rectangles, so order the corners first.
            # Well-formed boxes are unaffected.
            left, right = sorted((abs_x_min, abs_x_max))
            top, bottom = sorted((abs_y_min, abs_y_max))

            # Cycle through the named colors so neighbouring boxes differ.
            color = colors[i % len(colors)]

            # Pillow takes (x, y) pairs, whereas box_2d stores y before x.
            draw.rectangle(
                ((left, top), (right, bottom)),
                outline=color,
                width=4,
            )
            if label:
                # Place the label just inside the top-left corner of the box.
                draw.text((left + 8, top + 6), label, fill=color)

        im.show()
| # %% | |
| import json | |
# Overlay every saved model output on the page image it was generated from.
for output_path in Path(PDF_DIR).glob("*__model_output.txt"):
    print(output_path)
    with open(str(output_path), 'r') as f:
        parsed_boxes = json.load(f)
    # The output file name is the image path plus a fixed suffix; dropping
    # the suffix text recovers the image path.
    source_image = str(output_path).replace("__model_output.txt", "")
    plot_bounding_boxes(source_image, parsed_boxes)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment