Skip to content

Instantly share code, notes, and snippets.

@yudataguy
Forked from rguliev/convert_pdf.py
Created May 2, 2023 21:32
Show Gist options
  • Save yudataguy/d86fbae5b283b1c8d6584f9ff11d2ba8 to your computer and use it in GitHub Desktop.
Save yudataguy/d86fbae5b283b1c8d6584f9ff11d2ba8 to your computer and use it in GitHub Desktop.
Python 3: pdfminer code to convert pdf to text, html or xml
# Use `pip3 install pdfminer.six` for python3
from typing import Container
from io import BytesIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter, XMLConverter, HTMLConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
def convert_pdf(
    path: str,
    format: str = "text",
    codec: str = "utf-8",
    password: str = "",
    maxpages: int = 0,
    caching: bool = True,
    pagenos: Container[int] = frozenset(),
) -> str:
    """Convert a PDF file to text, HTML or XML using pdfminer.

    Parameters
    ----------
    path : str
        Path to the pdf file
    format : str, optional
        Format of output, must be one of: "text", "html", "xml".
        By default, "text" format is used
    codec : str, optional
        Encoding used by the converter to write its output and by this
        function to decode it back into a string. By default "utf-8" is used
    password : str, optional
        Password, passed through to ``PDFPage.get_pages``
    maxpages : int, optional
        Max number of pages to convert. By default is 0, i.e. reads all pages.
    caching : bool, optional
        Caching, passed through to ``PDFPage.get_pages``. By default is True
    pagenos : Container[int], optional
        Page numbers to convert, passed through to ``PDFPage.get_pages``.
        The default is an empty container.

    Returns
    -------
    str
        Converted pdf file

    Raises
    ------
    ValueError
        If ``format`` is not one of "text", "html" or "xml".
    """
    # Validate the requested format before allocating any resources.
    if format == "text":
        converter_cls = TextConverter
    elif format == "html":
        converter_cls = HTMLConverter
    elif format == "xml":
        converter_cls = XMLConverter
    else:
        raise ValueError("provide format, either text, html or xml!")

    rsrcmgr = PDFResourceManager()
    # The context managers guarantee the buffer and the file are released
    # even when page processing raises.
    with BytesIO() as retstr, open(path, "rb") as fp:
        device = converter_cls(
            rsrcmgr, retstr, codec=codec, laparams=LAParams()
        )
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(
                fp,
                pagenos,
                maxpages=maxpages,
                password=password,
                caching=caching,
                check_extractable=True,
            ):
                interpreter.process_page(page)
        finally:
            # Closing the device must happen BEFORE the buffer is read:
            # the HTML and XML converters emit their document footer
            # (closing tags) from close().
            device.close()
        # Decode with the same codec the converter used to encode.
        return retstr.getvalue().decode(codec)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment