Skip to content

Instantly share code, notes, and snippets.

@kyleavery
Created December 1, 2024 23:53
Show Gist options
  • Save kyleavery/5a2820f450e1c1a8aff425d1cc6d9790 to your computer and use it in GitHub Desktop.
Save kyleavery/5a2820f450e1c1a8aff425d1cc6d9790 to your computer and use it in GitHub Desktop.
PDF to Markdown
import os
import base64
from concurrent.futures import ThreadPoolExecutor, as_completed
import openai
from pdf2image import convert_from_path
from PIL import Image
# Directory where the cropped page images (and, later, the markdown output) are written.
OUTPUT_DIR = "test"
# Path of the PDF file to convert.
INPUT_PDF = "test.pdf"
# Number of pixels cropped from each edge of every rendered page by save_cropped_image.
TRIM_LEFT = 60
TRIM_TOP = 50
TRIM_RIGHT = 60
TRIM_BOTTOM = 60
# Render the PDF at module import time; the resulting list is the module-level
# `images` that save_cropped_image iterates over.
images = convert_from_path(
pdf_path=INPUT_PDF,
dpi=90,
thread_count=8,
# NOTE(review): pdf2image documents jpegopt as applying only to JPEG output and no
# `fmt` is passed here — confirm whether these options have any effect.
jpegopt={
"quality": 1,
"progressive": False,
"optimize": True,
},
grayscale=True,
# Presumably fixes the page width at 600 px and derives the height from the
# aspect ratio — verify against the pdf2image documentation.
size=(600, None),
)
def save_cropped_image():
    """Crop every rendered page and save it as a JPEG under ``OUTPUT_DIR``.

    Reads the module-level ``images`` list, removes ``TRIM_LEFT``, ``TRIM_TOP``,
    ``TRIM_RIGHT`` and ``TRIM_BOTTOM`` pixels from the corresponding edges of
    each page, and writes the result to ``OUTPUT_DIR/page<N>.jpg`` where ``N``
    starts at 1.
    """
    # Image.save does not create missing directories, so a fresh run would
    # otherwise fail on the very first page.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    for page_number, img in enumerate(images, start=1):
        crop_box = (
            TRIM_LEFT,
            TRIM_TOP,
            img.width - TRIM_RIGHT,
            img.height - TRIM_BOTTOM,
        )
        img.crop(crop_box).save(f"{OUTPUT_DIR}/page{page_number}.jpg", "JPEG")
# Shared OpenAI client used by convert_image_to_md. It is constructed without
# arguments — presumably the API key comes from the environment; confirm.
client = openai.OpenAI()
# Instruction text sent together with every page image.
PROMPT = "Convert this book page to markdown. ONLY REPLY WITH THE MARKDOWN."
def convert_image_to_md(image_path: str) -> str:
    """Send one page image to the chat model and return its markdown reply.

    The file at *image_path* is read as bytes, base64-encoded and embedded in a
    ``data:image/jpeg`` URL that is sent together with ``PROMPT`` in a single
    user message to the ``gpt-4o-mini`` model.

    Args:
        image_path: Path of the JPEG file to convert.

    Returns:
        The text content of the first completion choice, or an empty string
        when the response carries no text content.
    """
    img_type = "image/jpeg"
    with open(image_path, "rb") as f:
        img_b64_str = base64.b64encode(f.read()).decode("utf-8")
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": PROMPT},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:{img_type};base64,{img_b64_str}"},
                    },
                ],
            }
        ],
    )
    content = completion.choices[0].message.content
    # The message content is optional in the API response; normalise a missing
    # value so the declared ``str`` return type always holds for callers.
    return content if content is not None else ""
def _strip_markdown_fence(md):
    """Return *md* without one enclosing ```markdown or ``` fence, if present."""
    closing = "\n```"
    for opening in ("```markdown\n", "```\n"):
        if md.startswith(opening) and md.endswith(closing):
            return md[len(opening):-len(closing)]
    return md


def convert_images_to_md():
    """Convert every ``.jpg`` in ``OUTPUT_DIR`` to a markdown file.

    Each image is passed to ``convert_image_to_md`` on a pool of four worker
    threads. A code fence wrapping the whole reply is removed, and the text is
    written to ``OUTPUT_DIR/md/<image file name>.md``.

    Raises:
        Any exception raised while processing an image is re-raised here via
        ``future.result()``.
    """
    # The directory name must be interpolated; without the f-prefix the literal
    # path "./{OUTPUT_DIR}" was listed instead of the real output directory.
    image_paths = [
        f"./{OUTPUT_DIR}/{name}"
        for name in os.listdir(f"./{OUTPUT_DIR}")
        if name.endswith(".jpg")
    ]

    # open(..., "w") does not create parent directories.
    md_dir = f"./{OUTPUT_DIR}/md"
    os.makedirs(md_dir, exist_ok=True)

    def process_image(image_path):
        md = _strip_markdown_fence(convert_image_to_md(image_path))
        # Explicit UTF-8: model output may contain characters that the
        # platform's default encoding cannot represent.
        target = f"{md_dir}/{os.path.basename(image_path)}.md"
        with open(target, "w", encoding="utf-8") as f:
            f.write(md)

    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(process_image, path) for path in image_paths]
        for future in as_completed(futures):
            # Surface worker exceptions instead of silently dropping them.
            future.result()
# Script entry: write the cropped page JPEGs first, then convert them to
# markdown — the second step reads the files produced by the first.
save_cropped_image()
convert_images_to_md()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment