Skip to content

Instantly share code, notes, and snippets.

@philschmid
Created May 12, 2025 09:16
Show Gist options
  • Save philschmid/45bc63d0293530f93c3069361df7dbac to your computer and use it in GitHub Desktop.
Save philschmid/45bc63d0293530f93c3069361df7dbac to your computer and use it in GitHub Desktop.
import os
from google import genai
from pydantic import BaseModel, Field
# Build the Gemini API client. The key is taken from the GEMINI_API_KEY
# environment variable; "xxx" is the placeholder used when it is unset.
client = genai.Client(
    api_key=os.environ.get("GEMINI_API_KEY", "xxx"),
)
class PageText(BaseModel):
    """Represents the content of a page in the PDF document in markdown format."""

    # NOTE(review): pydantic typically exposes the class docstring and the
    # field description in the generated JSON schema, so both strings may be
    # sent to the model as part of `response_schema` — keep their wording
    # stable unless a prompt change is intended.
    text: str = Field(
        description="Extracted text of the page in markdown format.",
    )
# Prompt passed to the model together with each page image (it is the first
# element of `contents` in the generate_content call). It asks for a faithful
# Markdown transcription of the page, "[TYPE: Description]" placeholders for
# charts/graphics/tables that cannot be extracted as text, no extra
# commentary, and an empty string for empty pages.
# The string is sent verbatim, so edits here change model behavior.
EXTRACT_MARKDOWN = """
You are given an image of a page from a PDF document.
Your task is to extract the text accurately and convert it into Markdown format, ensuring proper structure and readability.
Requirements:
- Preserve Formatting: Maintain bold, italics, headings, lists, and other structural elements as they appear in the original text.
- Extract Tables: If a table is present and can be accurately extracted, convert it into Markdown table format.
- Charts & Graphics:
- If the page contains a chart, table, or graphic that cannot be extracted as text, insert a contextually recognizable placeholder where it appears (e.g., [CHART: Sales Performance Q1], [GRAPHIC: Workflow Diagram], [TABLE: Financial Summary]).
- The placeholder should be descriptive based on the content it represents to ensure easy identification later.
- Use this format for each placeholder: "[TYPE: Description]" (e.g., [CHART: Sales Performance Q1 - Bar Chart]).
- Accuracy: Ensure correct text extraction without modifying the content.
- No Extra Output: Return only the formatted Markdown text—no explanations, comments, or additional information.
- If the image is empty or contains no text, return an empty string.
""" # noqa: E501
# Number of generation attempts per page image. Kept in one place so the loop
# bound and the progress message cannot drift apart.
# NOTE(review): the repetition looks intended to surface intermittent
# structured-output failures — confirm.
MAX_ATTEMPTS = 10

for img_path in ["../assets/2.png", "../assets/7.png", "../assets/9.png"]:
    # Upload the file to the File API once; the returned handle is reused for
    # every attempt below.
    image = client.files.upload(file=img_path)
    for attempt in range(1, MAX_ATTEMPTS + 1):
        print(f"Attempt {attempt} of {MAX_ATTEMPTS} for {img_path}")
        try:
            response = client.models.generate_content(
                model="gemini-2.5-flash-preview-04-17",
                contents=[EXTRACT_MARKDOWN, image],
                config={
                    # Request JSON constrained to the PageText schema so the
                    # SDK can populate `response.parsed`.
                    "response_mime_type": "application/json",
                    "response_schema": PageText,
                },
            )
            page = response.parsed
            if page is None:
                # The SDK could not turn the model output into a PageText.
                # Show the raw text instead of failing with an opaque
                # "'NoneType' object has no attribute 'text'" message.
                print(
                    "Response could not be parsed into PageText; "
                    f"raw response text: {response.text!r}"
                )
                continue
            print(page.text)
        except Exception as e:
            # Best-effort script: report the failure and keep going with the
            # remaining attempts rather than aborting the whole run.
            print(e)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment