pszemraj · October 11, 2024 03:40
diff --git a/pdf2markdown_openai.py b/pdf2markdown_openai.py
 import base64
 import tempfile
 from pathlib import Path

 import fire
 import mdformat
 from joblib import Memory
 from openai import OpenAI
 from pdf2image import convert_from_path
 from tqdm.auto import tqdm

 # Set up joblib caching: functions decorated with ``memory.cache`` store their
 # results on disk under ~/.cache/.joblib (the directory is created if missing).
 cache_dir = Path.home() / ".cache" / ".joblib"
 cache_dir.mkdir(parents=True, exist_ok=True)
 memory = Memory(cache_dir, verbose=0)

 # Default instruction sent with every page image: asks for markdown-only output
 # and for the literal marker [NO_CONTENT_FOUND] when the page is blank.
 PROMPT_TEXT = """Please rewrite the text in the image into well-formatted, clean markdown. You do not need to use a markdown code block, just make sure your output is markdown only.
 Try to reproduce the essentials of figures in the image. For very simple figures that can be concisely represented with ASCII symbols, use ASCII. For plots or complicated figures, instead write a description of the figure and its purpose. If the image is blank, simply return [NO_CONTENT_FOUND]"""

 # assumes OpenAI API key in your env vars as "OPENAI_API_KEY"
 # Shared client for all requests; max_retries=10 and timeout=180 are passed
 # straight to the OpenAI client (timeout presumably in seconds - confirm).
 client = OpenAI(max_retries=10, timeout=180)


 def encode_image(image_path):
    """
    encode_image - Read a file and return its contents as base64 text
    :param image_path: path to the image file to read
    :return str: base64 encoding of the file's raw bytes, decoded as UTF-8
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


 @memory.cache
 def _transcribe_encoded_image(base64_image, prompt, model_name):
    """
    _transcribe_encoded_image - Ask the chat model to transcribe one encoded image
    :param str base64_image: base64 text of the image, embedded as a JPEG data URL
    :param str prompt: instruction text sent alongside the image
    :param str model_name: name of the chat-completions model to query
    :return: text content of the first choice in the response
    """
    response = client.chat.completions.create(
        model=model_name,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt,
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}",
                            "detail": "high",
                        },
                    },
                ],
            }
        ],
        max_tokens=4000,
        temperature=0,
    )
    return response.choices[0].message.content


 def process_image(
    image_path,
    prompt: str = PROMPT_TEXT,
    model_name: str = "gpt-4o-mini",
 ):
    """
    process_image - Transcribe a single page image to markdown
    :param image_path: path to the image file (sent to the model as a JPEG)
    :param str prompt: instruction text, defaults to PROMPT_TEXT
    :param str model_name: model name, defaults to "gpt-4o-mini"
    :return: the model's text response for the image
    """
    # Cache on the image *content* rather than on image_path: callers render
    # pages into randomly named temporary directories, so a path-based key
    # never repeats between runs and the on-disk cache would never be hit.
    return _transcribe_encoded_image(encode_image(image_path), prompt, model_name)


 def process_pdf(
    pdf_path,
    out_dir=None,
    model_name: str = "gpt-4o-mini",
 ):
    """
    process_pdf - Convert PDF to images and transcribe to markdown

    Each page is rendered to a JPEG in a temporary directory, transcribed with
    ``process_image`` and written to ``page_<n>.md``. Pages that do not contain
    the ``[NO_CONTENT_FOUND]`` marker are then joined into ``<pdf stem>.md``,
    formatted with mdformat when that succeeds.

    :param str pdf_path: path to the PDF file
    :param str out_dir: output directory, defaults to None
        (``<pdf stem>_markdown-<model_name>`` next to the PDF)
    :param str model_name: model name, defaults to "gpt-4o-mini"
    :raises FileNotFoundError: if ``pdf_path`` does not exist
    """
    pdf_path = Path(pdf_path)
    # Explicit raise instead of ``assert``: asserts are stripped by ``python -O``.
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF file {pdf_path} does not exist")

    if out_dir is None:
        out_path = pdf_path.parent / f"{pdf_path.stem}_markdown-{model_name}"
    else:
        out_path = Path(out_dir)
    out_path.mkdir(parents=True, exist_ok=True)

    with tempfile.TemporaryDirectory() as temp_dir:
        print("Converting PDF to images...")
        images = convert_from_path(pdf_path)

        all_markdown = []
        for page_number, image in enumerate(
            tqdm(images, desc="Processing pages"), start=1
        ):
            image_path = Path(temp_dir) / f"page_{page_number}.jpg"
            image.save(str(image_path), "JPEG")

            # The response may carry no text content (None); treat that as an
            # empty page instead of crashing on ``.strip()``.
            markdown_content = (
                process_image(str(image_path), model_name=model_name) or ""
            )
            output_file = out_path / f"page_{page_number}.md"
            with output_file.open("w", encoding="utf-8") as f:
                f.write(markdown_content.strip())
            all_markdown.append(markdown_content)

    # drop the pages the model flagged as blank, then build the overall doc
    all_content_sections = [
        s for s in all_markdown if "[NO_CONTENT_FOUND]" not in s
    ]
    raw_doc_text = "\n\n".join(all_content_sections)
    try:
        full_doc_text = mdformat.text(
            raw_doc_text,
            options={
                "number": True,  # switch on consecutive numbering of ordered lists
                "wrap": "no",
            },
        )
    except Exception as e:
        # best effort: fall back to the unformatted text rather than failing
        print(f"unable to format with mdformat: {e}")
        full_doc_text = raw_doc_text

    # write the overall doc to a file
    with (out_path / f"{pdf_path.stem}.md").open("w", encoding="utf-8") as f:
        f.write(full_doc_text)

    print(f"Saved to:\n\t{str(out_path)}")


 # Script entry point: python-fire builds the command line from process_pdf,
 # so its parameters (pdf_path, out_dir, model_name) become CLI arguments.
 if __name__ == "__main__":
    fire.Fire(process_pdf)
	import base64
	import tempfile
	from pathlib import Path

	import fire
	import mdformat
	from joblib import Memory
	from openai import OpenAI
	from pdf2image import convert_from_path
	from tqdm.auto import tqdm

	# Set up joblib caching: functions decorated with ``memory.cache`` store their
	# results on disk under ~/.cache/.joblib (the directory is created if missing).
	cache_dir = Path.home() / ".cache" / ".joblib"
	cache_dir.mkdir(parents=True, exist_ok=True)
	memory = Memory(cache_dir, verbose=0)

	# Default instruction sent with every page image: asks for markdown-only output
	# and for the literal marker [NO_CONTENT_FOUND] when the page is blank.
	PROMPT_TEXT = """Please rewrite the text in the image into well-formatted, clean markdown. You do not need to use a markdown code block, just make sure your output is markdown only.
	Try to reproduce the essentials of figures in the image. For very simple figures that can be concisely represented with ASCII symbols, use ASCII. For plots or complicated figures, instead write a description of the figure and its purpose. If the image is blank, simply return [NO_CONTENT_FOUND]"""

	# assumes OpenAI API key in your env vars as "OPENAI_API_KEY"
	# Shared client for all requests; max_retries=10 and timeout=180 are passed
	# straight to the OpenAI client (timeout presumably in seconds - confirm).
	client = OpenAI(max_retries=10, timeout=180)


	def encode_image(image_path):
	    """
	    encode_image - Read a file and return its contents as base64 text
	    :param image_path: path to the image file to read
	    :return str: base64 encoding of the file's raw bytes, decoded as UTF-8
	    """
	    # Nesting restored: the ``with`` body and ``return`` belong inside the function.
	    with open(image_path, "rb") as image_file:
	        return base64.b64encode(image_file.read()).decode("utf-8")


	@memory.cache
	def _transcribe_encoded_image(base64_image, prompt, model_name):
	    """
	    _transcribe_encoded_image - Ask the chat model to transcribe one encoded image
	    :param str base64_image: base64 text of the image, embedded as a JPEG data URL
	    :param str prompt: instruction text sent alongside the image
	    :param str model_name: name of the chat-completions model to query
	    :return: text content of the first choice in the response
	    """
	    response = client.chat.completions.create(
	        model=model_name,
	        messages=[
	            {
	                "role": "user",
	                "content": [
	                    {
	                        "type": "text",
	                        "text": prompt,
	                    },
	                    {
	                        "type": "image_url",
	                        "image_url": {
	                            "url": f"data:image/jpeg;base64,{base64_image}",
	                            "detail": "high",
	                        },
	                    },
	                ],
	            }
	        ],
	        max_tokens=4000,
	        temperature=0,
	    )
	    return response.choices[0].message.content


	def process_image(
	    image_path,
	    prompt: str = PROMPT_TEXT,
	    model_name: str = "gpt-4o-mini",
	):
	    """
	    process_image - Transcribe a single page image to markdown
	    :param image_path: path to the image file (sent to the model as a JPEG)
	    :param str prompt: instruction text, defaults to PROMPT_TEXT
	    :param str model_name: model name, defaults to "gpt-4o-mini"
	    :return: the model's text response for the image
	    """
	    # Cache on the image *content* rather than on image_path: callers render
	    # pages into randomly named temporary directories, so a path-based key
	    # never repeats between runs and the on-disk cache would never be hit.
	    return _transcribe_encoded_image(encode_image(image_path), prompt, model_name)


	def process_pdf(
	    pdf_path,
	    out_dir=None,
	    model_name: str = "gpt-4o-mini",
	):
	    """
	    process_pdf - Convert PDF to images and transcribe to markdown

	    Each page is rendered to a JPEG in a temporary directory, transcribed with
	    ``process_image`` and written to ``page_<n>.md``. Pages that do not contain
	    the ``[NO_CONTENT_FOUND]`` marker are then joined into ``<pdf stem>.md``,
	    formatted with mdformat when that succeeds.

	    :param str pdf_path: path to the PDF file
	    :param str out_dir: output directory, defaults to None
	        (``<pdf stem>_markdown-<model_name>`` next to the PDF)
	    :param str model_name: model name, defaults to "gpt-4o-mini"
	    :raises FileNotFoundError: if ``pdf_path`` does not exist
	    """
	    pdf_path = Path(pdf_path)
	    # Explicit raise instead of ``assert``: asserts are stripped by ``python -O``.
	    if not pdf_path.exists():
	        raise FileNotFoundError(f"PDF file {pdf_path} does not exist")

	    if out_dir is None:
	        out_path = pdf_path.parent / f"{pdf_path.stem}_markdown-{model_name}"
	    else:
	        out_path = Path(out_dir)
	    out_path.mkdir(parents=True, exist_ok=True)

	    with tempfile.TemporaryDirectory() as temp_dir:
	        print("Converting PDF to images...")
	        images = convert_from_path(pdf_path)

	        all_markdown = []
	        for page_number, image in enumerate(
	            tqdm(images, desc="Processing pages"), start=1
	        ):
	            image_path = Path(temp_dir) / f"page_{page_number}.jpg"
	            image.save(str(image_path), "JPEG")

	            # The response may carry no text content (None); treat that as an
	            # empty page instead of crashing on ``.strip()``.
	            markdown_content = (
	                process_image(str(image_path), model_name=model_name) or ""
	            )
	            output_file = out_path / f"page_{page_number}.md"
	            with output_file.open("w", encoding="utf-8") as f:
	                f.write(markdown_content.strip())
	            all_markdown.append(markdown_content)

	    # drop the pages the model flagged as blank, then build the overall doc
	    all_content_sections = [
	        s for s in all_markdown if "[NO_CONTENT_FOUND]" not in s
	    ]
	    raw_doc_text = "\n\n".join(all_content_sections)
	    try:
	        full_doc_text = mdformat.text(
	            raw_doc_text,
	            options={
	                "number": True,  # switch on consecutive numbering of ordered lists
	                "wrap": "no",
	            },
	        )
	    except Exception as e:
	        # best effort: fall back to the unformatted text rather than failing
	        print(f"unable to format with mdformat: {e}")
	        full_doc_text = raw_doc_text

	    # write the overall doc to a file
	    with (out_path / f"{pdf_path.stem}.md").open("w", encoding="utf-8") as f:
	        f.write(full_doc_text)

	    print(f"Saved to:\n\t{str(out_path)}")


	# Script entry point: python-fire builds the command line from process_pdf,
	# so its parameters (pdf_path, out_dir, model_name) become CLI arguments.
	# The call is indented again so that it sits inside the guard.
	if __name__ == "__main__":
	    fire.Fire(process_pdf)