Created
June 13, 2024 09:09
-
-
Save Erol444/9fa445a98853c5606a296611596fffe3 to your computer and use it in GitHub Desktop.
openai_parse_pdf_output_json.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from openai import OpenAI | |
from openai.types.beta.threads.message_create_params import Attachment, AttachmentToolFileSearch | |
import os | |
from dotenv import load_dotenv | |
load_dotenv() | |
import json | |
# Settings for this example script.
PDF_PATH = 'my_inference.pdf'
ASSISTANT_NAME = 'My Assistant Name'
PROMPT = "Tell me 5 interesting things that you find in this PDF file."
RUN_TIMEOUT_SECONDS = 300  # 5 minutes


def extract_json_object(text):
    """Return the first JSON object embedded in *text*.

    The assistant cannot be forced into JSON mode while "file_search" is in
    use, so its answer is usually prose and/or a Markdown code fence wrapped
    around the JSON (e.g. "Of course, here's the parsed text: {...}").
    Every '{' is tried in order as a possible start of the object, which
    tolerates code fences, braces in the surrounding prose and trailing text.

    Args:
        text: Raw text returned by the assistant.

    Returns:
        The decoded JSON object (a dict).

    Raises:
        json.JSONDecodeError: If no position in *text* starts a valid JSON
            object.
    """
    decoder = json.JSONDecoder()
    start = text.find('{')
    while start != -1:
        try:
            # raw_decode stops at the end of the object, ignoring what follows.
            parsed, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            start = text.find('{', start + 1)
        else:
            return parsed
    raise json.JSONDecodeError("No JSON object found in assistant output", text, 0)


def _find_or_create_assistant(client):
    """Return the assistant named ASSISTANT_NAME, creating it if it is missing."""
    existing = next(
        (a for a in client.beta.assistants.list() if a.name == ASSISTANT_NAME),
        None,
    )
    if existing is not None:
        return existing
    # NOTE: response_format={"type": "json_object"} isn't possible together
    # with "file_search", so JSON output is requested via the instructions.
    return client.beta.assistants.create(
        model='gpt-4o',
        description='You are a PDF retrieval assistant.',
        instructions="You are a helpful assistant designed to output only JSON. Find information from the text and files provided.",
        tools=[{"type": "file_search"}],
        name=ASSISTANT_NAME,
    )


def main():
    """Upload the PDF, ask the assistant about it and print the parsed JSON.

    Returns:
        The JSON object extracted from the assistant's answer.

    Raises:
        RuntimeError: If the run does not complete or the answer is not text.
        json.JSONDecodeError: If the answer contains no JSON object.
    """
    # Add your OpenAI API key
    client = OpenAI(api_key=os.getenv("OPENAI_KEY"))

    # Upload the file to the OpenAI API; the local handle is closed right after.
    with open(PDF_PATH, 'rb') as pdf:
        uploaded = client.files.create(file=pdf, purpose='assistants')

    try:
        # Create thread and add the user message (with the PDF attached) to it
        thread = client.beta.threads.create()
        client.beta.threads.messages.create(
            thread_id=thread.id,
            role='user',
            content=PROMPT,
            attachments=[Attachment(file_id=uploaded.id, tools=[AttachmentToolFileSearch(type='file_search')])],
        )

        assistant = _find_or_create_assistant(client)

        # Run the thread with the assistant. It will wait until the message is processed.
        run = client.beta.threads.runs.create_and_poll(
            thread_id=thread.id,
            assistant_id=assistant.id,
            instructions="Please output in JSON format",
            timeout=RUN_TIMEOUT_SECONDS,
        )
        # Eg. issue with openai server
        if run.status != "completed":
            raise RuntimeError(f'Run failed: {run.status}')

        # Only the first listed message is needed, so avoid fetching every page.
        message = next(iter(client.beta.threads.messages.list(thread_id=thread.id)), None)
        if message is None or not message.content or message.content[0].type != "text":
            raise RuntimeError('Assistant did not return a text message')

        # Parse the JSON part of the assistant's output
        data = extract_json_object(message.content[0].text.value)
        print(data)
        return data
    finally:
        # Delete the file afterwards to preserve space (max 100gb/company),
        # even when the run or the parsing failed.
        client.files.delete(uploaded.id)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment