jondurbin · October 2, 2025 13:53 · jondurbin · Oct 2, 2025
diff --git a/dots_example.py b/dots_example.py
 import json
 import requests
 import base64
 import openai
 import os

 # openai client pointed at the llm.chutes.ai endpoint; the API key is taken
 # from the CHUTES_API_KEY environment variable.
 client = openai.Client(
    api_key=os.getenv("CHUTES_API_KEY"),
    base_url="https://llm.chutes.ai/v1",
 )

 # Layout-extraction instructions that accompany the page image in the request.
 prompt = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.

 1. Bbox format: [x1, y1, x2, y2]

 2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].

 3. Text Extraction & Formatting Rules:
    - Picture: For the 'Picture' category, the text field should be omitted.
    - Formula: Format its text as LaTeX.
    - Table: Format its text as HTML.
    - All Others (Text, Title, etc.): Format their text as Markdown.

 4. Constraints:
    - The output text must be the original text from the image, with no translation.
    - All layout elements must be sorted according to human reading order.

 5. Final Output: The entire output must be a single JSON object.
 """

 # Load the page image (the document has to be rendered to PNG beforehand)
 # and wrap its base64 encoding in a data URL.
 with open("document.png", "rb") as image_file:
    png_bytes = image_file.read()
 encoded_png = base64.b64encode(png_bytes).decode()
 image_data = f"data:image/png;base64,{encoded_png}"

 # One user turn with two content parts: the image, then the text prompt
 # prefixed with the <|img|><|imgpad|><|endofimg|> marker string.
 image_part = {"type": "image_url", "image_url": {"url": image_data}}
 text_part = {"type": "text", "text": f"<|img|><|imgpad|><|endofimg|>{prompt}"}
 messages = [{"role": "user", "content": [image_part, text_part]}]

 # Send the chat request to dots.ocr.
 response = client.chat.completions.create(
    model="rednote-hilab/dots.ocr",
    messages=messages,
    temperature=0.9,
 )

 # The reply text is parsed as JSON (json.loads raises if it is not valid
 # JSON) and pretty-printed with a 2-space indent.
 reply_text = response.choices[0].message.content
 layout = json.loads(reply_text)
 print(json.dumps(layout, indent=2))
	import json
	import requests
	import base64
	import openai
	import os

	# openai client pointed at the llm.chutes.ai endpoint; the API key is read
	# from the CHUTES_API_KEY environment variable.
	client = openai.Client(base_url="https://llm.chutes.ai/v1", api_key=os.getenv("CHUTES_API_KEY"))

	# Layout-extraction instructions sent to the model together with the image.
	prompt = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.

	1. Bbox format: [x1, y1, x2, y2]

	2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].

	3. Text Extraction & Formatting Rules:
	    - Picture: For the 'Picture' category, the text field should be omitted.
	    - Formula: Format its text as LaTeX.
	    - Table: Format its text as HTML.
	    - All Others (Text, Title, etc.): Format their text as Markdown.

	4. Constraints:
	    - The output text must be the original text from the image, with no translation.
	    - All layout elements must be sorted according to human reading order.

	5. Final Output: The entire output must be a single JSON object.
	"""

	# Read a document (converted to image first) and turn it into a base64
	# data URL. Both assignments belong to the body of the `with` statement.
	with open("document.png", "rb") as infile:
	    base64_str = base64.b64encode(infile.read()).decode()
	    image_data = f"data:image/png;base64,{base64_str}"

	# Send a chat request to dots.ocr: a single user turn holding the image
	# part followed by the text part.
	messages = [
	    {
	        "role": "user",
	        "content": [
	            {
	                "type": "image_url",
	                "image_url": {"url": image_data},
	            },
	            {
	                "type": "text",
	                # The marker tokens must be written with plain pipes; a
	                # backslash before "|" would be sent to the model verbatim.
	                "text": f"<|img|><|imgpad|><|endofimg|>{prompt}"
	            },
	        ],
	    }
	]
	response = client.chat.completions.create(
	    messages=messages,
	    model="rednote-hilab/dots.ocr",
	    temperature=0.9,
	)
	# Parse the reply as JSON (json.loads raises if the text is not valid JSON)
	# and pretty-print it with a 2-space indent.
	print(json.dumps(json.loads(response.choices[0].message.content), indent=2))