Last active
January 4, 2024 05:54
-
-
Save dfeldman/cb790287621098fcd84731668404dde2 to your computer and use it in GitHub Desktop.
Script to send a PDF file to ChatGPT page by page
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import os | |
import base64 | |
import requests | |
import io | |
import hashlib | |
from PyPDF2 import PdfFileReader | |
from pdf2image import convert_from_path | |
# OpenAI API key. Prefer supplying it through the OPENAI_API_KEY environment
# variable so the secret never has to be written into (or committed with) this
# file. The literal placeholder remains the fallback, so editing it in place
# still works when the environment variable is not set.
api_key = os.environ.get("OPENAI_API_KEY", "API KEY HERE")

# Instruction text sent to the model together with every page image.
prompt = (
    "Perform a detailed OCR (Optical Character Recognition) analysis on the provided image. "
    "Extract all readable text. Identify and list the date of the document at the beginning, if determinable. "
    "Also, provide a comma-separated list of any names mentioned in the text. "
    "Then, transcribe the entire content of the page.\n\n"
)
# Function to encode the image to base64
def encode_image(image):
    """Return *image* rendered as JPEG and encoded as a base64 text string.

    The image is written into an in-memory buffer through its
    ``save(fileobj, format="JPEG")`` method; nothing touches the disk.
    Presumably *image* is a PIL image produced by pdf2image - confirm
    against the caller.
    """
    jpeg_buffer = io.BytesIO()
    image.save(jpeg_buffer, format="JPEG")
    jpeg_bytes = jpeg_buffer.getvalue()
    encoded = base64.b64encode(jpeg_bytes)
    return encoded.decode('utf-8')
# Function to process each page
def process_page(page, output_file, page_number, progress_file):
    """Send one page image to the OpenAI chat completions API and record the reply.

    Args:
        page: Image object accepted by ``encode_image``.
        output_file: Path of the text file the transcription is appended to.
        page_number: Page number used in log messages, in the output file
            heading and as the value stored in the progress file.
        progress_file: Path of the file that is overwritten with
            ``page_number`` after a successful transcription.

    Returns:
        True when the API answered with status 200 and the result was
        written; False when it answered with any other status (nothing is
        written and the progress file is left untouched).

    Raises:
        requests.exceptions.RequestException: on connection failure or when
            the request exceeds the timeout below.
    """
    print("encode page ", page_number)
    base64_image = encode_image(page)
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    payload = {
        "model": "gpt-4-vision-preview",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt,
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    }
                ]
            }
        ],
        # NOTE(review): 300 tokens is probably too few for a full-page
        # transcription and may truncate the output - confirm the intent.
        "max_tokens": 300
    }
    print("post page ", page_number)
    # (connect, read) timeout in seconds: without one, a stalled connection
    # would block the whole run indefinitely.
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=(10, 300),
    )
    if response.status_code != 200:
        print(f"Error processing page {page_number}: {response.status_code}")
        return False

    response_data = response.json()
    # "choices" may be missing *or* an empty list; fall back to a blank entry
    # in both cases so indexing [0] cannot raise IndexError.
    choices = response_data.get('choices') or [{}]
    content = choices[0].get('message', {}).get('content', 'No content available')
    # Explicit UTF-8: transcribed text routinely contains characters that the
    # platform default encoding may be unable to represent.
    with open(output_file, 'a', encoding='utf-8') as f:
        f.write(f"Page {page_number}:\n{content}\n\n")
        f.write("-"*60)
        f.write("\n\n")
    # Record progress only after the output has been written successfully.
    with open(progress_file, 'w', encoding='utf-8') as f:
        f.write(str(page_number))
    print(f"Processed page {page_number}")
    return True
# Function to compute file hash
def compute_hash(file_path):
    """Return the SHA-256 hex digest of the file at *file_path*.

    The file is read in 1 MiB chunks so that large files are hashed without
    loading them into memory in one piece; the digest is identical to hashing
    the whole content at once.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    hasher = hashlib.sha256()
    with open(file_path, 'rb') as f:
        # iter() with a sentinel keeps calling f.read until it returns b"" (EOF).
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            hasher.update(chunk)
    return hasher.hexdigest()
def main(pdf_path, start_page, end_page):
    """Transcribe pages ``start_page``..``end_page`` (1-based, inclusive) of a PDF.

    Progress is stored in ``<sha256-of-pdf>_progress.txt`` in the current
    directory as the number of the last successfully processed page, so a
    rerun resumes after that page. Transcriptions are appended to
    ``<pdf name without extension>_output.txt``.

    Args:
        pdf_path: Path to the PDF file.
        start_page: First page to process (1-based).
        end_page: Last page to process (1-based, inclusive).

    Returns:
        None. Problems are reported on stdout rather than raised.
    """
    if not os.path.exists(pdf_path):
        print("PDF file does not exist.")
        return
    file_hash = compute_hash(pdf_path)
    progress_file = f"{file_hash}_progress.txt"
    output_file = f"{os.path.splitext(pdf_path)[0]}_output.txt"

    last_processed_page = 0
    if os.path.exists(progress_file):
        with open(progress_file, 'r') as f:
            saved_progress = f.read().strip()
        try:
            last_processed_page = int(saved_progress)
        except ValueError:
            # An empty or corrupt progress file should not abort the run;
            # start over from the requested start page instead.
            print(f"Ignoring unreadable progress file {progress_file}.")
    if last_processed_page >= end_page:
        print("Processing already completed for this range.")
        return

    # Zero-based index of the first page still to do: the later of the
    # requested start and the page after the last one already processed.
    first_index = max(start_page - 1, last_processed_page)
    if first_index >= end_page:
        print("No pages to process in the requested range.")
        return

    try:
        # Rasterise only the pages that are needed instead of the whole
        # document; pdf2image limits last_page to the document's page count.
        pages = convert_from_path(
            pdf_path, first_page=first_index + 1, last_page=end_page
        )
        for page_number, page in enumerate(pages, start=first_index + 1):
            print("processing page ", page_number)
            # Stop at the first failed page: carrying on would let a later
            # success overwrite the progress file and the failed page would
            # be skipped for good on the next run.
            if process_page(page, output_file, page_number, progress_file) is False:
                print(f"Stopping at page {page_number}; rerun to resume.")
                break
    except Exception as e:
        # Top-level boundary of the script: report the problem and exit cleanly.
        print(f"An error occurred: {e}")
if __name__ == "__main__":
    # Command-line entry point: script.py <path_to_pdf> <start_page> <end_page>
    usage = "Usage: python script.py <path_to_pdf> <start_page> <end_page>"
    if len(sys.argv) != 4:
        print(usage)
        # Non-zero status so shells and calling scripts can detect the misuse.
        sys.exit(1)
    try:
        start_page = int(sys.argv[2])
        end_page = int(sys.argv[3])
    except ValueError:
        # Report bad page numbers as a usage problem instead of a traceback.
        print("start_page and end_page must be integers.")
        print(usage)
        sys.exit(1)
    pdf_path = sys.argv[1]
    main(pdf_path, start_page, end_page)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment