Skip to content

Instantly share code, notes, and snippets.

@jondurbin
Created October 2, 2025 13:53
Show Gist options
  • Save jondurbin/4a541632e236579c6e0f2d9f5f524b83 to your computer and use it in GitHub Desktop.
Save jondurbin/4a541632e236579c6e0f2d9f5f524b83 to your computer and use it in GitHub Desktop.
dots.ocr example
import json
import requests
import base64
import openai
import os
# Resolve the key explicitly instead of passing os.getenv() straight through:
# when api_key is None the openai client falls back to OPENAI_API_KEY, which
# would send an OpenAI credential to llm.chutes.ai.
chutes_api_key = os.getenv("CHUTES_API_KEY")
if not chutes_api_key:
    raise SystemExit("CHUTES_API_KEY environment variable is not set")
# OpenAI-compatible client pointed at the Chutes endpoint.
client = openai.Client(base_url="https://llm.chutes.ai/v1", api_key=chutes_api_key)
# Layout-extraction instruction for the OCR request. It is assembled from one
# literal per line (each ending in "\n") so individual rules are easy to diff.
prompt = (
    "Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.\n"
    "1. Bbox format: [x1, y1, x2, y2]\n"
    "2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].\n"
    "3. Text Extraction & Formatting Rules:\n"
    "- Picture: For the 'Picture' category, the text field should be omitted.\n"
    "- Formula: Format its text as LaTeX.\n"
    "- Table: Format its text as HTML.\n"
    "- All Others (Text, Title, etc.): Format their text as Markdown.\n"
    "4. Constraints:\n"
    "- The output text must be the original text from the image, with no translation.\n"
    "- All layout elements must be sorted according to human reading order.\n"
    "5. Final Output: The entire output must be a single JSON object.\n"
)
# Read a document (converted to image first) and wrap it as a base64 data URL
# so it can be embedded directly in the chat request.
with open("document.png", "rb") as infile:
    # The read must happen inside the `with` body, while the file is open.
    base64_str = base64.b64encode(infile.read()).decode()
image_data = f"data:image/png;base64,{base64_str}"
# Build the single user turn for the dots.ocr chat request: the page image
# first, then the instruction text prefixed with the literal
# <|img|><|imgpad|><|endofimg|> marker.
image_part = {"type": "image_url", "image_url": {"url": image_data}}
text_part = {"type": "text", "text": f"<|img|><|imgpad|><|endofimg|>{prompt}"}
messages = [{"role": "user", "content": [image_part, text_part]}]
# Send the request to the hosted dots.ocr model.
response = client.chat.completions.create(
    messages=messages,
    model="rednote-hilab/dots.ocr",
    # Layout extraction should be reproducible and must yield strict JSON, so
    # sample near-greedily; a high temperature makes both the transcription
    # and the JSON structure less reliable.
    temperature=0.1,
)
raw_output = response.choices[0].message.content
try:
    layout = json.loads(raw_output)
except (TypeError, json.JSONDecodeError) as exc:
    # TypeError covers a missing (None) message content. Show the raw model
    # output so a malformed reply can be inspected instead of a bare traceback.
    raise SystemExit(
        f"dots.ocr did not return valid JSON ({exc}); raw output:\n{raw_output}"
    ) from exc
# ensure_ascii=False keeps non-ASCII OCR text readable instead of \uXXXX escapes.
print(json.dumps(layout, indent=2, ensure_ascii=False))
@jondurbin
Copy link
Author

document
$ python dots_example.py
[
  {
    "bbox": [
      21,
      33,
      217,
      88
    ],
    "category": "Page-header",
    "text": "Chutes"
  },
  {
    "bbox": [
      285,
      39,
      530,
      78
    ],
    "category": "Page-header",
    "text": "Create an Account"
  },
  {
    "bbox": [
      1264,
      37,
      1500,
      80
    ],
    "category": "Page-header",
    "text": "Q Search Chutes"
  },
  ...

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment