Skip to content

Instantly share code, notes, and snippets.

@pszemraj
Created October 11, 2024 03:40
Show Gist options
  • Save pszemraj/b690e626663301b3243094338c406e4b to your computer and use it in GitHub Desktop.
Save pszemraj/b690e626663301b3243094338c406e4b to your computer and use it in GitHub Desktop.
Transcribe PDF files to Markdown text with OpenAI.
import base64
import tempfile
from pathlib import Path
import fire
import mdformat
from joblib import Memory
from openai import OpenAI
from pdf2image import convert_from_path
from tqdm.auto import tqdm
# Set up joblib caching: cached results are persisted on disk under ~/.cache/.joblib
cache_dir = Path.home() / ".cache" / ".joblib"
# Runs at import time, so the cache directory exists before Memory is constructed
cache_dir.mkdir(parents=True, exist_ok=True)
# Disk-backed memoizer used by the image transcription step; verbose=0 keeps joblib quiet
memory = Memory(cache_dir, verbose=0)
# Default instruction sent with every page image. The "[NO_CONTENT_FOUND]" marker it asks
# for on blank pages is what process_pdf later uses to drop pages from the combined document.
PROMPT_TEXT = """Please rewrite the text in the image into well-formatted, clean markdown. You do not need to use a markdown code block, just make sure your output is markdown only.
Try to reproduce the essentials of figures in the image. For very simple figures that can be concisely represented with ASCII symbols, use ASCII. For plots or complicated figures, instead write a description of the figure and its purpose. If the image is blank, simply return [NO_CONTENT_FOUND]"""
# assumes OpenAI API key in your env vars as "OPENAI_API_KEY"
# Single shared client, created at import time; requests are retried up to 10 times
# with a timeout of 180 (presumably seconds -- confirm against the OpenAI SDK docs).
client = OpenAI(max_retries=10, timeout=180)
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a base64 text string.

    The file is read in binary mode; the base64 output is decoded as UTF-8 so
    it can be embedded directly in a ``data:`` URL.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
@memory.cache(ignore=["base64_image"])
def _transcribe_image_cached(
    image_digest: str,
    base64_image: str,
    prompt: str,
    model_name: str,
):
    """Send one base64-encoded image plus ``prompt`` to the chat completions API.

    Results are cached on disk by joblib, keyed on ``image_digest``, ``prompt``
    and ``model_name``. ``base64_image`` is excluded from the cache key (and from
    joblib's stored call metadata) because ``image_digest`` already identifies it.

    :param str image_digest: hex digest identifying the image content
    :param str base64_image: base64-encoded image bytes sent to the API
    :param str prompt: instruction text sent alongside the image
    :param str model_name: name of the model to query
    :return: the text content of the first choice in the response
    """
    response = client.chat.completions.create(
        model=model_name,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt,
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}",
                            "detail": "high",
                        },
                    },
                ],
            }
        ],
        max_tokens=4000,
        temperature=0,
    )
    return response.choices[0].message.content


def process_image(
    image_path,
    prompt: str = PROMPT_TEXT,
    model_name: str = "gpt-4o-mini",
):
    """
    process_image - Transcribe a single image file to markdown via the OpenAI API

    The cached result is keyed on the image *content* rather than its path:
    a path inside a temporary directory changes on every run (so a path key
    would never hit), and a reused path whose file changed would otherwise
    return a stale transcription.

    :param image_path: path to the image file to transcribe
    :param str prompt: instruction text sent with the image, defaults to PROMPT_TEXT
    :param str model_name: model name, defaults to "gpt-4o-mini"
    :return: the model's reply text for the image
    """
    import hashlib  # local import keeps this block self-contained

    base64_image = encode_image(image_path)
    image_digest = hashlib.sha256(base64_image.encode("ascii")).hexdigest()
    return _transcribe_image_cached(image_digest, base64_image, prompt, model_name)
def process_pdf(
    pdf_path,
    out_dir=None,
    model_name: str = "gpt-4o-mini",
):
    """
    process_pdf - Convert PDF to images and transcribe to markdown

    Each page is rendered to a JPEG in a temporary directory, transcribed with
    process_image, and written to ``page_<n>.md`` in the output directory. A
    combined ``<pdf stem>.md`` is written as well, omitting pages whose
    transcription contains ``[NO_CONTENT_FOUND]``.

    :param str pdf_path: path to the PDF file
    :param str out_dir: output directory, defaults to None, in which case a
        ``<pdf stem>_markdown-<model_name>`` directory next to the PDF is used
    :param str model_name: model name, defaults to "gpt-4o-mini"
    :raises FileNotFoundError: if ``pdf_path`` does not exist
    """
    pdf_path = Path(pdf_path)
    # Explicit exception instead of assert: asserts are stripped under ``python -O``.
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF file {pdf_path} does not exist")

    if out_dir is None:
        out_path = pdf_path.parent / f"{pdf_path.stem}_markdown-{model_name}"
    else:
        out_path = Path(out_dir)
    out_path.mkdir(parents=True, exist_ok=True)

    all_markdown = []
    with tempfile.TemporaryDirectory() as temp_dir:
        print("Converting PDF to images...")
        images = convert_from_path(pdf_path)
        for page_number, image in enumerate(
            tqdm(images, desc="Processing pages"), start=1
        ):
            image_path = Path(temp_dir) / f"page_{page_number}.jpg"
            image.save(str(image_path), "JPEG")
            # The API may return no text content (None); treat that as an empty
            # page instead of crashing on ``.strip()`` below.
            markdown_content = (
                process_image(str(image_path), model_name=model_name) or ""
            )
            output_file = out_path / f"page_{page_number}.md"
            with output_file.open("w", encoding="utf-8") as f:
                f.write(markdown_content.strip())
            all_markdown.append(markdown_content)

    # Build the overall document, leaving out pages flagged as blank.
    all_content_sections = [
        s for s in all_markdown if "[NO_CONTENT_FOUND]" not in s
    ]
    joined_sections = "\n\n".join(all_content_sections)
    try:
        full_doc_text = mdformat.text(
            joined_sections,
            options={
                "number": True,  # switch on consecutive numbering of ordered lists
                "wrap": "no",
            },
        )
    except Exception as e:
        # Formatting is best-effort: fall back to the unformatted text.
        print(f"unable to format with mdformat: {e}")
        full_doc_text = joined_sections

    # write the overall doc to a file
    with (out_path / f"{pdf_path.stem}.md").open("w", encoding="utf-8") as f:
        f.write(full_doc_text)
    print(f"Saved to:\n\t{str(out_path)}")
# Script entry point: python-fire exposes process_pdf's parameters as
# command-line arguments when this file is executed directly.
if __name__ == "__main__":
    fire.Fire(process_pdf)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment