dfeldman · January 4, 2024 05:54
diff --git a/pdttochatgpt.py b/pdttochatgpt.py
 import sys
 import os
 import base64
 import requests
 import io
 import hashlib
 from PyPDF2 import PdfFileReader
 from pdf2image import convert_from_path

 # OpenAI API Key
 # Taken from the OPENAI_API_KEY environment variable when it is set and
 # non-empty, so the secret does not have to be stored in this file. The
 # placeholder literal is kept as the fallback for in-place editing.
 api_key = os.environ.get("OPENAI_API_KEY") or "API KEY HERE"

 # Instruction text sent along with every page image. It asks for the
 # document date, a comma-separated list of names, and a full transcription.
 prompt = "".join((
     "Perform a detailed OCR (Optical Character Recognition) analysis on the provided image. ",
     "Extract all readable text. Identify and list the date of the document at the beginning, if determinable. ",
     "Also, provide a comma-separated list of any names mentioned in the text. ",
     "Then, transcribe the entire content of the page.\n\n",
 ))

 # Function to encode the image to base64
 def encode_image(image):
     """Return the JPEG encoding of ``image`` as a base64 text string.

     ``image`` is only used through its ``save(file_obj, format="JPEG")``
     method; the JPEG bytes are produced in an in-memory buffer.
     """
     jpeg_stream = io.BytesIO()
     image.save(jpeg_stream, format="JPEG")
     jpeg_bytes = jpeg_stream.getvalue()
     encoded = base64.b64encode(jpeg_bytes)
     return encoded.decode('utf-8')

 # Function to process each page
 def process_page(page, output_file, page_number, progress_file, max_tokens=300):
     """Transcribe one page image through the OpenAI chat completions API.

     The page is encoded with ``encode_image`` and posted together with the
     module-level ``prompt``. On an HTTP 200 reply the returned text is
     appended to ``output_file`` and ``page_number`` is written to
     ``progress_file``. Any other status is only reported on stdout and
     nothing is written.

     Args:
         page: Image object accepted by ``encode_image``.
         output_file: Path of the text file the transcription is appended to.
         page_number: Number used in the log messages, in the output heading
             and as the value stored in ``progress_file``.
         progress_file: Path that is overwritten with ``page_number`` after
             the transcription has been written.
         max_tokens: Value sent as the request's ``max_tokens`` field. The
             default keeps the cap this script has always used; pass a larger
             value if replies come back cut short.

     Returns:
         True when the page was written, False when the API answered with a
         status other than 200.

     Raises:
         requests.RequestException: on connection failures or when the
             timeout below expires.
     """
     print("encode page ", page_number)

     base64_image = encode_image(page)

     headers = {
         "Content-Type": "application/json",
         "Authorization": f"Bearer {api_key}"
     }

     payload = {
         "model": "gpt-4-vision-preview",
         "messages": [
             {
                 "role": "user",
                 "content": [
                     {
                         "type": "text",
                         "text": prompt,
                     },
                     {
                         "type": "image_url",
                         "image_url": {
                             "url": f"data:image/jpeg;base64,{base64_image}"
                         }
                     }
                 ]
             }
         ],
         "max_tokens": max_tokens
     }

     print("post page ", page_number)

     # Without a timeout, requests waits indefinitely on a stalled connection.
     # The tuple is (connect, read) in seconds; the read limit is generous
     # because the reply is only sent once the whole completion is ready.
     response = requests.post(
         "https://api.openai.com/v1/chat/completions",
         headers=headers,
         json=payload,
         timeout=(10, 300),
     )

     if response.status_code != 200:
         print(f"Error processing page {page_number}: {response.status_code}")
         return False

     response_data = response.json()
     # "or [{}]" also covers an empty "choices" list, which would otherwise
     # raise IndexError on the [0] lookup.
     choices = response_data.get('choices') or [{}]
     content = choices[0].get('message', {}).get('content', 'No content available')

     # UTF-8 is requested explicitly: transcribed text can contain characters
     # that the platform's default encoding is unable to write.
     with open(output_file, 'a', encoding='utf-8') as f:
         f.write(f"Page {page_number}:\n{content}\n\n")
         f.write("-"*60)
         f.write("\n\n")
     # The progress marker is updated only after the output has been written.
     with open(progress_file, 'w') as f:
         f.write(str(page_number))
     print(f"Processed page {page_number}")
     return True

 # Function to compute file hash
 def compute_hash(file_path):
     """Return the SHA-256 hex digest of the file at ``file_path``.

     The file is read in 1 MiB chunks so memory use does not grow with the
     size of the file; the digest equals that of hashing the whole content
     in one call.
     """
     hasher = hashlib.sha256()
     with open(file_path, 'rb') as f:
         # iter() with a sentinel stops at the first empty read (end of file).
         for chunk in iter(lambda: f.read(1024 * 1024), b''):
             hasher.update(chunk)
     return hasher.hexdigest()

 def main(pdf_path, start_page, end_page):
     """Run the OCR pipeline over a page range of a PDF, resuming when possible.

     Transcriptions are appended to ``<pdf path without extension>_output.txt``.
     The last written page number is kept in ``<sha256 of the PDF>_progress.txt``
     (a relative path, so it lives in the current working directory). When
     that file exists, processing continues after the recorded page instead
     of at ``start_page``.

     Args:
         pdf_path: Path to the PDF file.
         start_page: First page to process, 1-based.
         end_page: Last page to process, 1-based and inclusive.

     Returns:
         None. Problems are reported on stdout rather than raised.
     """
     if not os.path.exists(pdf_path):
         print("PDF file does not exist.")
         return

     file_hash = compute_hash(pdf_path)
     progress_file = f"{file_hash}_progress.txt"
     output_file = f"{os.path.splitext(pdf_path)[0]}_output.txt"

     last_processed_page = 0
     if os.path.exists(progress_file):
         with open(progress_file, 'r') as f:
             try:
                 last_processed_page = int(f.read())
             except ValueError:
                 # An empty or garbled progress file must not block every
                 # later run; fall back to the requested start page.
                 print(f"Ignoring unreadable progress file: {progress_file}")

     if last_processed_page >= end_page:
         print("Processing already completed for this range.")
         return

     # First page still to do (1-based): the requested start, or the page
     # after the last recorded one, whichever is later. Never below page 1.
     first_page = max(start_page - 1, last_processed_page, 0) + 1

     try:
         # Only the pages that will actually be sent are rasterised, instead
         # of the whole document. Iterating over what pdf2image returns also
         # means an end_page past the last page no longer ends in IndexError.
         pages = convert_from_path(pdf_path, first_page=first_page, last_page=end_page)
         for page_number, page in enumerate(pages, start=first_page):
             print("processing page ", page_number)
             # NOTE(review): process_page only prints when the API rejects a
             # page and this loop then moves on. A later success overwrites
             # the progress file with a higher number, so a resumed run will
             # not retry the failed page.
             process_page(page, output_file, page_number, progress_file)
     except Exception as e:
         print(f"An error occurred: {e}")

 if __name__ == "__main__":
     # Command-line entry point: <path_to_pdf> <start_page> <end_page>.
     if len(sys.argv) != 4:
         print("Usage: python script.py <path_to_pdf> <start_page> <end_page>")
     else:
         pdf_path = sys.argv[1]
         try:
             start_page = int(sys.argv[2])
             end_page = int(sys.argv[3])
         except ValueError:
             # Non-numeric page arguments get the usage text, not a traceback.
             print("Usage: python script.py <path_to_pdf> <start_page> <end_page>")
             print("start_page and end_page must be integers.")
         else:
             main(pdf_path, start_page, end_page)
	import sys
	import os
	import base64
	import requests
	import io
	import hashlib
	from PyPDF2 import PdfFileReader
	from pdf2image import convert_from_path

	# OpenAI API Key
	# Taken from the OPENAI_API_KEY environment variable when it is set and
	# non-empty, so the secret does not have to be stored in this file. The
	# placeholder literal is kept as the fallback for in-place editing.
	api_key = os.environ.get("OPENAI_API_KEY") or "API KEY HERE"

	# Instruction text sent along with every page image. It asks for the
	# document date, a comma-separated list of names, and a full transcription.
	prompt = "".join((
	    "Perform a detailed OCR (Optical Character Recognition) analysis on the provided image. ",
	    "Extract all readable text. Identify and list the date of the document at the beginning, if determinable. ",
	    "Also, provide a comma-separated list of any names mentioned in the text. ",
	    "Then, transcribe the entire content of the page.\n\n",
	))

	# Function to encode the image to base64
	def encode_image(image):
	    """Return the JPEG encoding of ``image`` as a base64 text string.

	    ``image`` is only used through its ``save(file_obj, format="JPEG")``
	    method; the JPEG bytes are produced in an in-memory buffer.
	    """
	    # The function body had lost its indentation; structure restored.
	    buffered = io.BytesIO()
	    image.save(buffered, format="JPEG")
	    return base64.b64encode(buffered.getvalue()).decode('utf-8')

	# Function to process each page
	def process_page(page, output_file, page_number, progress_file, max_tokens=300):
	    """Transcribe one page image through the OpenAI chat completions API.

	    The page is encoded with ``encode_image`` and posted together with the
	    module-level ``prompt``. On an HTTP 200 reply the returned text is
	    appended to ``output_file`` and ``page_number`` is written to
	    ``progress_file``. Any other status is only reported on stdout and
	    nothing is written.

	    Args:
	        page: Image object accepted by ``encode_image``.
	        output_file: Path of the text file the transcription is appended to.
	        page_number: Number used in the log messages, in the output heading
	            and as the value stored in ``progress_file``.
	        progress_file: Path that is overwritten with ``page_number`` after
	            the transcription has been written.
	        max_tokens: Value sent as the request's ``max_tokens`` field. The
	            default keeps the cap this script has always used; pass a larger
	            value if replies come back cut short.

	    Returns:
	        True when the page was written, False when the API answered with a
	        status other than 200.

	    Raises:
	        requests.RequestException: on connection failures or when the
	            timeout below expires.
	    """
	    print("encode page ", page_number)

	    base64_image = encode_image(page)

	    headers = {
	        "Content-Type": "application/json",
	        "Authorization": f"Bearer {api_key}"
	    }

	    payload = {
	        "model": "gpt-4-vision-preview",
	        "messages": [
	            {
	                "role": "user",
	                "content": [
	                    {
	                        "type": "text",
	                        "text": prompt,
	                    },
	                    {
	                        "type": "image_url",
	                        "image_url": {
	                            "url": f"data:image/jpeg;base64,{base64_image}"
	                        }
	                    }
	                ]
	            }
	        ],
	        "max_tokens": max_tokens
	    }

	    print("post page ", page_number)

	    # Without a timeout, requests waits indefinitely on a stalled connection.
	    # The tuple is (connect, read) in seconds; the read limit is generous
	    # because the reply is only sent once the whole completion is ready.
	    response = requests.post(
	        "https://api.openai.com/v1/chat/completions",
	        headers=headers,
	        json=payload,
	        timeout=(10, 300),
	    )

	    if response.status_code != 200:
	        print(f"Error processing page {page_number}: {response.status_code}")
	        return False

	    response_data = response.json()
	    # "or [{}]" also covers an empty "choices" list, which would otherwise
	    # raise IndexError on the [0] lookup.
	    choices = response_data.get('choices') or [{}]
	    content = choices[0].get('message', {}).get('content', 'No content available')

	    # UTF-8 is requested explicitly: transcribed text can contain characters
	    # that the platform's default encoding is unable to write.
	    with open(output_file, 'a', encoding='utf-8') as f:
	        f.write(f"Page {page_number}:\n{content}\n\n")
	        f.write("-"*60)
	        f.write("\n\n")
	    # The progress marker is updated only after the output has been written.
	    with open(progress_file, 'w') as f:
	        f.write(str(page_number))
	    print(f"Processed page {page_number}")
	    return True

	# Function to compute file hash
	def compute_hash(file_path):
	    """Return the SHA-256 hex digest of the file at ``file_path``.

	    The file is read in 1 MiB chunks so memory use does not grow with the
	    size of the file; the digest equals that of hashing the whole content
	    in one call.
	    """
	    hasher = hashlib.sha256()
	    with open(file_path, 'rb') as f:
	        # iter() with a sentinel stops at the first empty read (end of file).
	        for chunk in iter(lambda: f.read(1024 * 1024), b''):
	            hasher.update(chunk)
	    return hasher.hexdigest()

	def main(pdf_path, start_page, end_page):
	    """Run the OCR pipeline over a page range of a PDF, resuming when possible.

	    Transcriptions are appended to ``<pdf path without extension>_output.txt``.
	    The last written page number is kept in ``<sha256 of the PDF>_progress.txt``
	    (a relative path, so it lives in the current working directory). When
	    that file exists, processing continues after the recorded page instead
	    of at ``start_page``.

	    Args:
	        pdf_path: Path to the PDF file.
	        start_page: First page to process, 1-based.
	        end_page: Last page to process, 1-based and inclusive.

	    Returns:
	        None. Problems are reported on stdout rather than raised.
	    """
	    if not os.path.exists(pdf_path):
	        print("PDF file does not exist.")
	        return

	    file_hash = compute_hash(pdf_path)
	    progress_file = f"{file_hash}_progress.txt"
	    output_file = f"{os.path.splitext(pdf_path)[0]}_output.txt"

	    last_processed_page = 0
	    if os.path.exists(progress_file):
	        with open(progress_file, 'r') as f:
	            try:
	                last_processed_page = int(f.read())
	            except ValueError:
	                # An empty or garbled progress file must not block every
	                # later run; fall back to the requested start page.
	                print(f"Ignoring unreadable progress file: {progress_file}")

	    if last_processed_page >= end_page:
	        print("Processing already completed for this range.")
	        return

	    # First page still to do (1-based): the requested start, or the page
	    # after the last recorded one, whichever is later. Never below page 1.
	    first_page = max(start_page - 1, last_processed_page, 0) + 1

	    try:
	        # Only the pages that will actually be sent are rasterised, instead
	        # of the whole document. Iterating over what pdf2image returns also
	        # means an end_page past the last page no longer ends in IndexError.
	        pages = convert_from_path(pdf_path, first_page=first_page, last_page=end_page)
	        for page_number, page in enumerate(pages, start=first_page):
	            print("processing page ", page_number)
	            # NOTE(review): process_page only prints when the API rejects a
	            # page and this loop then moves on. A later success overwrites
	            # the progress file with a higher number, so a resumed run will
	            # not retry the failed page.
	            process_page(page, output_file, page_number, progress_file)
	    except Exception as e:
	        print(f"An error occurred: {e}")

	if __name__ == "__main__":
	    # Command-line entry point: <path_to_pdf> <start_page> <end_page>.
	    if len(sys.argv) != 4:
	        print("Usage: python script.py <path_to_pdf> <start_page> <end_page>")
	    else:
	        pdf_path = sys.argv[1]
	        try:
	            start_page = int(sys.argv[2])
	            end_page = int(sys.argv[3])
	        except ValueError:
	            # Non-numeric page arguments get the usage text, not a traceback.
	            print("Usage: python script.py <path_to_pdf> <start_page> <end_page>")
	            print("start_page and end_page must be integers.")
	        else:
	            main(pdf_path, start_page, end_page)