Transcribe PDF files to markdown text with the OpenAI API
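Dependencies and invocation (a sketch, not part of the gist): the script imports `openai`, `pdf2image`, `fire`, `mdformat`, `joblib`, and `tqdm`, and `pdf2image` additionally needs the poppler utilities installed; the API key is read from the `OPENAI_API_KEY` environment variable. Because the entry point is `fire.Fire(process_pdf)`, command-line flags map onto the keyword arguments of `process_pdf`, e.g. `python transcribe_pdf.py ./paper.pdf --model_name gpt-4o-mini` (the script file name and PDF path are hypothetical). A programmatic example follows the script below.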
import base64
import tempfile
from pathlib import Path

import fire
import mdformat
from joblib import Memory
from openai import OpenAI
from pdf2image import convert_from_path
from tqdm.auto import tqdm

# Set up joblib caching
cache_dir = Path.home() / ".cache" / ".joblib"
cache_dir.mkdir(parents=True, exist_ok=True)
memory = Memory(cache_dir, verbose=0)
PROMPT_TEXT = """Please rewrite the text in the image into well-formatted, clean markdown. You do not need to use a markdown code block, just make sure your output is markdown only.
Try to reproduce the essentials of figures in the image. For very simple figures that can be concisely represented with ASCII symbols, use ASCII. For plots or complicated figures, instead write a description of the figure and its purpose. If the image is blank, simply return [NO_CONTENT_FOUND]"""

# assumes OpenAI API key in your env vars as "OPENAI_API_KEY"
client = OpenAI(max_retries=10, timeout=180)
def encode_image(image_path):
    """Return the base64-encoded contents of an image file as a UTF-8 string."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")
@memory.cache
def process_image(
    image_path,
    prompt: str = PROMPT_TEXT,
    model_name: str = "gpt-4o-mini",
):
    """Transcribe a single page image to markdown via the OpenAI chat completions API."""
    base64_image = encode_image(image_path)
    response = client.chat.completions.create(
        model=model_name,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt,
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}",
                            "detail": "high",
                        },
                    },
                ],
            }
        ],
        max_tokens=4000,
        temperature=0,
    )
    return response.choices[0].message.content
def process_pdf(
    pdf_path,
    out_dir=None,
    model_name: str = "gpt-4o-mini",
):
    """
    process_pdf - Convert PDF pages to images and transcribe them to markdown

    :param str pdf_path: path to the PDF file
    :param str out_dir: output directory, defaults to None (a directory next to the PDF)
    :param str model_name: model name, defaults to "gpt-4o-mini"
    """
    pdf_path = Path(pdf_path)
    assert pdf_path.exists(), f"PDF file {pdf_path} does not exist"
    if out_dir is None:
        out_path = pdf_path.parent / f"{pdf_path.stem}_markdown-{model_name}"
    else:
        out_path = Path(out_dir)
    out_path.mkdir(parents=True, exist_ok=True)

    with tempfile.TemporaryDirectory() as temp_dir:
        print("Converting PDF to images...")
        images = convert_from_path(pdf_path)

        all_markdown = []
        for i, image in enumerate(tqdm(images, desc="Processing pages")):
            image_path = Path(temp_dir) / f"page_{i + 1}.jpg"
            image.save(str(image_path), "JPEG")

            markdown_content = process_image(str(image_path), model_name=model_name)

            output_file = out_path / f"page_{i + 1}.md"
            with output_file.open("w", encoding="utf-8") as f:
                f.write(markdown_content.strip())
            all_markdown.append(markdown_content)

        # write the overall doc to a file
        with (out_path / f"{pdf_path.stem}.md").open("w", encoding="utf-8") as f:
            all_content_sections = [
                s for s in all_markdown if "[NO_CONTENT_FOUND]" not in s.strip()
            ]
            try:
                full_doc_text = mdformat.text(
                    "\n\n".join(all_content_sections),
                    options={
                        "number": True,  # switch on consecutive numbering of ordered lists
                        "wrap": "no",
                    },
                )
            except Exception as e:
                print(f"unable to format with mdformat: {e}")
                full_doc_text = "\n\n".join(all_content_sections)
            f.write(full_doc_text)

    print(f"Saved to:\n\t{out_path}")


if __name__ == "__main__":
    fire.Fire(process_pdf)
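A minimal programmatic sketch of the same flow (the module name and paths are hypothetical, assuming the gist was saved as transcribe_pdf.py in the working directory):

    # assumes the gist is saved as transcribe_pdf.py on the import path
    from transcribe_pdf import process_pdf

    # transcribe each page with the default gpt-4o-mini model; output lands next to the PDF
    process_pdf("./paper.pdf", model_name="gpt-4o-mini")

For an input named paper.pdf with the default model, this writes a paper_markdown-gpt-4o-mini/ directory containing page_1.md, page_2.md, and so on, plus a combined paper.md in which pages returned as [NO_CONTENT_FOUND] are dropped and the remaining text is normalized with mdformat.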