-
-
Save philschmid/45bc63d0293530f93c3069361df7dbac to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

from google import genai
from pydantic import BaseModel, Field
# Create the Gemini API client. The key is read from the GEMINI_API_KEY
# environment variable; the placeholder "xxx" is used when it is unset.
client = genai.Client(api_key=os.getenv("GEMINI_API_KEY", "xxx"))
class PageText(BaseModel):
    """Represents the content of a page in the PDF document in markdown format."""

    # Single structured-output field; passed as `response_schema` to the
    # generate_content call so the reply is parsed into this model.
    text: str = Field(description="Extracted text of the page in markdown format.")
# Prompt sent alongside each page image. The three sub-items under
# "Charts & Graphics" are nested so the model reads them as belonging to that
# requirement rather than as independent top-level rules.
EXTRACT_MARKDOWN = """
You are given an image of a page from a PDF document.
Your task is to extract the text accurately and convert it into Markdown format, ensuring proper structure and readability.
Requirements:
- Preserve Formatting: Maintain bold, italics, headings, lists, and other structural elements as they appear in the original text.
- Extract Tables: If a table is present and can be accurately extracted, convert it into Markdown table format.
- Charts & Graphics:
  - If the page contains a chart, table, or graphic that cannot be extracted as text, insert a contextually recognizable placeholder where it appears (e.g., [CHART: Sales Performance Q1], [GRAPHIC: Workflow Diagram], [TABLE: Financial Summary]).
  - The placeholder should be descriptive based on the content it represents to ensure easy identification later.
  - Use this format for each placeholder: "[TYPE: Description]" (e.g., [CHART: Sales Performance Q1 - Bar Chart]).
- Accuracy: Ensure correct text extraction without modifying the content.
- No Extra Output: Return only the formatted Markdown text—no explanations, comments, or additional information.
- If the image is empty or contains no text, return an empty string.
"""  # noqa: E501
# For each sample page image: upload it once, then request the structured
# Markdown extraction 10 times, printing either the parsed text or the error
# raised by that attempt. All 10 attempts run regardless of earlier outcomes.
for img_path in ["../assets/2.png", "../assets/7.png", "../assets/9.png"]:
    # Upload the file to the File API
    image = client.files.upload(file=img_path)
    for i in range(10):
        print(f"Attempt {i+1} of 10 for {img_path}")
        try:
            response = client.models.generate_content(
                model="gemini-2.5-flash-preview-04-17",
                contents=[EXTRACT_MARKDOWN, image],
                config={
                    'response_mime_type': 'application/json',
                    'response_schema': PageText,
                },
            )
            # Accessing `.parsed.text` is inside the try so that a failure
            # there is reported like any other error for this attempt.
            print(response.parsed.text)
        except Exception as e:
            # Best-effort script: report the failure and move on to the
            # next attempt instead of aborting the whole run.
            print(e)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment