Skip to content

Instantly share code, notes, and snippets.

@elecnix
Created November 6, 2023 01:39
Show Gist options
  • Save elecnix/6cddad5e278e44a013749b810c9e98e3 to your computer and use it in GitHub Desktop.
Save elecnix/6cddad5e278e44a013749b810c9e98e3 to your computer and use it in GitHub Desktop.
OCR with ChatGPT Plus
"""
OCR with ChatGPT Plus
Convert a bunch of image files into text, using Playwright to control your Chrome browser.
Prerequisites:
- Run `chrome --remote-debugging-port=9222`
- Log in to ChatGPT
- Select GPT-4
"""
import asyncio
import base64
import mimetypes
from pathlib import Path

from playwright.async_api import async_playwright
from playwright.async_api import Browser
from playwright.async_api import Page
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
async def drag_and_drop(page: Page, file: Path, drop_zone_selector: str) -> None:
    """Simulate dropping a local file onto an element of the page.

    The file is read from disk, base64-encoded and handed to the browser,
    where it is rebuilt as a ``File`` inside a ``DataTransfer`` object. A
    synthetic ``drop`` event carrying that object is then dispatched on the
    element matched by ``drop_zone_selector``.

    Args:
        page: Playwright page that contains the drop target.
        file: Path of the file to upload (a plain string is accepted too).
        drop_zone_selector: Selector of the element that receives the event.
    """
    file = Path(file)
    payload_base64 = base64.b64encode(file.read_bytes()).decode("ascii")

    # guess_type() returns None for unknown extensions; fall back to a
    # generic type so the File is always created with a valid MIME string.
    mime_type, _ = mimetypes.guess_type(file.name)
    if mime_type is None:
        mime_type = "application/octet-stream"

    # The argument list is destructured in the parameter list so that the
    # callback does not assign to undeclared (global) names in the page.
    data_transfer = await page.evaluate_handle(
        """([data, filename, mimeType]) => {
            const dt = new DataTransfer();
            const bytes = Uint8Array.from(atob(data), c => c.charCodeAt(0));
            dt.items.add(new File([bytes], filename, { type: mimeType }));
            return dt;
        }""",
        [payload_base64, file.name, mime_type],
    )
    await page.dispatch_event(
        drop_zone_selector, "drop", {"dataTransfer": data_transfer}
    )
async def send(page: Page, file_path: Path, prompt: str):
    """Attach one file to a ChatGPT message, send it and return the reply.

    Args:
        page: Playwright page showing the ChatGPT conversation.
        file_path: File to attach; a plain string path is accepted too.
        prompt: Text typed into the prompt box alongside the attachment.

    Returns:
        The text content of the last conversation turn found on the page
        after a "Regenerate" button is present.

    Raises:
        RuntimeError: If no conversation turn is found on the page.
        Playwright's ``TimeoutError`` propagates when one of the selector
        waits expires.
    """
    prompt_box = "#prompt-textarea"
    send_timeout_ms = 60 * 1000
    reply_timeout_ms = 5 * 60 * 1000

    await page.wait_for_selector(prompt_box)
    # drag_and_drop reads ``.name`` from its argument, so normalise to a Path.
    await drag_and_drop(page, Path(file_path), prompt_box)
    # Fixed pause after the drop, presumably to let the upload start before
    # the prompt is typed -- TODO confirm whether the send-button wait below
    # is sufficient on its own.
    await asyncio.sleep(3)
    await page.fill(prompt_box, prompt)
    await page.wait_for_selector(
        'button[data-testid="send-button"]:enabled', timeout=send_timeout_ms
    )
    await page.press(prompt_box, "Enter")
    # NOTE(review): when the conversation already holds an earlier reply, a
    # "Regenerate" button from that turn may still match here -- confirm
    # against the live UI.
    await page.wait_for_selector(
        'button:has-text("Regenerate")', timeout=reply_timeout_ms
    )
    turns = await page.query_selector_all('[data-testid^="conversation-turn-"]')
    if not turns:
        raise RuntimeError("No conversation turns found on the page")
    return await turns[-1].text_content()
async def main():
    """Send every file of the current directory to ChatGPT and print replies.

    Connects over CDP to the Chrome instance listening on port 9222, opens a
    new page on ChatGPT and calls ``send`` once per file, retrying a file up
    to five times when a timeout occurs. The browser connection is closed
    when processing ends, whether or not an error occurred.
    """
    prompt = """Transcribe in Markdown. Write page number. Answer only with the transcription."""
    retries = 5
    # iterdir() yields entries in arbitrary order; sort so that files are
    # processed in a stable, name-based order.
    files = sorted(entry for entry in Path('./').iterdir() if entry.is_file())

    async with async_playwright() as playwright:
        browser: Browser = await playwright.chromium.connect_over_cdp(
            "http://localhost:9222"
        )
        try:
            page = await browser.contexts[0].new_page()
            await page.goto("https://chat.openai.com/")
            for file in files:
                for attempt in range(retries):
                    try:
                        print(f"Processing file: {file}")
                        response = await send(page, file, prompt)
                    # Playwright's TimeoutError is not a subclass of the
                    # builtin one, so it has to be named explicitly; the
                    # builtin is kept for any other timeout source.
                    except (PlaywrightTimeoutError, TimeoutError):
                        if attempt < retries - 1:
                            print(f"Retrying {attempt + 1}/{retries}...")
                            await asyncio.sleep(5)
                        else:
                            print("Max retries reached, skipping file.")
                    else:
                        print(response)
                        break
        finally:
            await browser.close()
# Start the automation only when the file is executed as a script, so that
# importing it does not drive the browser.
if __name__ == "__main__":
    asyncio.run(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment