
@johnidm
Created June 7, 2025 16:01
Using an LLM to process a PDF file

https://community.openai.com/t/how-does-openai-charge-tokens-when-sending-pdf-content-in-a-prompt/1280985

OpenAI

from openai import OpenAI
import base64
from pydantic import BaseModel

client = OpenAI()  # reads the API key from the OPENAI_API_KEY environment variable

file_name = "doc2.pdf"

# Read the PDF and base64-encode it so it can be embedded directly in the request.
with open(file_name, "rb") as file:
    file_data = file.read()
    base64_data = base64.b64encode(file_data).decode('utf-8')


class DataExtractor(BaseModel):
    # Structured-output schema: the model is asked to return only an email address.
    email: str


# Send the encoded PDF plus instructions and parse the reply into DataExtractor.
response = client.beta.chat.completions.parse(
  model="gpt-4.1-mini",
  messages=[
    {
      "role": "system",
      "content": [
        {
          "type": "text",
          "text": "You are a expert to extract data from a PDF files."
        }
      ]
    },
    {
      "role": "user",
      "content": [
        {
          "type": "text",
          "text": "PDF file:\n"
        },
        {
          "type": "file",
          "file": {
            "file_data": "data:application/pdf;base64," + base64_data,
            "filename": file_name,
          }
        }
      ]
    }
  ],
  response_format=DataExtractor
)

data: DataExtractor = response.choices[0].message.parsed
print(data)
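
The linked thread above asks how tokens are charged when PDF content is sent in a prompt. A quick way to see the actual charge for a given file is to read the usage field returned with the completion; a minimal sketch, reusing the response object from the script above:

# Token accounting reported by the API for this request.
usage = response.usage
print(f"prompt tokens:     {usage.prompt_tokens}")
print(f"completion tokens: {usage.completion_tokens}")
print(f"total tokens:      {usage.total_tokens}")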
