Created
December 1, 2024 23:53
-
-
Save kyleavery/5a2820f450e1c1a8aff425d1cc6d9790 to your computer and use it in GitHub Desktop.
PDF to Markdown
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import base64
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

import openai
from pdf2image import convert_from_path
from PIL import Image
# Directory that receives the cropped page JPEGs (and the Markdown output).
OUTPUT_DIR = "test"
# PDF file that is rendered to page images.
INPUT_PDF = "test.pdf"

# Number of pixels cut from each edge of a rendered page before it is saved.
TRIM_LEFT = 60
TRIM_TOP = 50
TRIM_RIGHT = 60
TRIM_BOTTOM = 60

# Render every page of INPUT_PDF once, at module load time.
# NOTE(review): pdf2image documents `jpegopt` as applying to JPEG output only,
# and no `fmt` is passed here -- confirm whether these options have any effect.
# NOTE(review): size=(600, None) presumably fixes the width at 600 px and keeps
# the aspect ratio; the TRIM_* values are tuned to that size -- confirm.
images = convert_from_path(
    pdf_path=INPUT_PDF,
    dpi=90,
    thread_count=8,
    jpegopt={
        "quality": 1,
        "progressive": False,
        "optimize": True,
    },
    grayscale=True,
    size=(600, None),
)
def save_cropped_image():
    """Crop the margins off every rendered page and save it as a JPEG.

    Reads the module-level ``images`` list and writes one file per page,
    named ``page<N>.jpg`` with ``N`` starting at 1, into ``OUTPUT_DIR``.
    ``TRIM_LEFT``, ``TRIM_TOP``, ``TRIM_RIGHT`` and ``TRIM_BOTTOM`` pixels are
    removed from the corresponding edges.
    """
    output_dir = Path(OUTPUT_DIR)
    # Create the target directory up front so the first save cannot fail on a
    # fresh run where it does not exist yet.
    output_dir.mkdir(parents=True, exist_ok=True)

    for page_number, page in enumerate(images, start=1):
        crop_box = (
            TRIM_LEFT,
            TRIM_TOP,
            page.width - TRIM_RIGHT,
            page.height - TRIM_BOTTOM,
        )
        page.crop(crop_box).save(output_dir / f"page{page_number}.jpg", "JPEG")
# Single OpenAI client shared by all conversion calls; built with no explicit
# arguments, so its configuration comes from the library defaults.
client = openai.OpenAI()

# Instruction sent together with every page image.
PROMPT = "Convert this book page to markdown. ONLY REPLY WITH THE MARKDOWN."
def convert_image_to_md(image_path: str) -> str:
    """Ask the chat model to transcribe one page image as Markdown.

    The file is read as bytes, base64-encoded and sent inline as a
    ``data:image/jpeg`` URL together with ``PROMPT`` in a single user message
    to the ``gpt-4o-mini`` model.

    Args:
        image_path: Path of the image file to send. It is always labelled as
            ``image/jpeg``, whatever the file actually contains.

    Returns:
        The text content of the first completion choice, or an empty string
        when the response carries no text content.
    """
    img_type = "image/jpeg"
    with open(image_path, "rb") as f:
        img_b64_str = base64.b64encode(f.read()).decode("utf-8")

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": PROMPT},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:{img_type};base64,{img_b64_str}"},
                    },
                ],
            }
        ],
    )

    # `content` may be None (for example when the model returns no text);
    # normalise to "" so the declared `str` return type always holds.
    content = completion.choices[0].message.content
    return content or ""
def _strip_markdown_fence(md):
    """Return *md* without a surrounding Markdown code fence, if it has one.

    Only a reply that both starts with an opening fence line (with or without
    the ``markdown`` language tag) and ends with a closing fence line is
    unwrapped; any other text is returned unchanged.
    """
    closing = "\n```"
    for opening in ("```markdown\n", "```\n"):
        if md.startswith(opening) and md.endswith(closing):
            return md[len(opening):-len(closing)]
    return md


def convert_images_to_md():
    """Convert every ``.jpg`` in ``OUTPUT_DIR`` to a Markdown file.

    Each image is passed to ``convert_image_to_md``; a wrapping code fence is
    removed from the reply and the text is written to
    ``OUTPUT_DIR/md/<image file name>.md`` (e.g. ``page1.jpg.md``). Up to four
    images are processed concurrently.

    Raises:
        Exception: whatever a worker raised (API or file error) is re-raised
            here as soon as that worker's future is collected.
    """
    output_dir = Path(OUTPUT_DIR)
    md_dir = output_dir / "md"
    # The Markdown directory is not created anywhere else; make sure it exists
    # before any worker tries to write into it.
    md_dir.mkdir(parents=True, exist_ok=True)

    # List the real output directory (the original string lacked the f-prefix
    # and therefore looked for a directory literally named "{OUTPUT_DIR}").
    image_paths = [
        entry for entry in output_dir.iterdir() if entry.name.endswith(".jpg")
    ]

    def process_image(image_path):
        md = _strip_markdown_fence(convert_image_to_md(str(image_path)))
        # Explicit UTF-8 so non-ASCII characters in the transcription do not
        # depend on the platform's default encoding.
        (md_dir / f"{image_path.name}.md").write_text(md, encoding="utf-8")

    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(process_image, path) for path in image_paths]
        for future in as_completed(futures):
            # result() re-raises any exception raised inside the worker.
            future.result()
# Run the pipeline: write the cropped page JPEGs first, then convert those
# files to Markdown (the second step reads what the first one wrote).
save_cropped_image()
convert_images_to_md()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment