-
-
Save rguliev/3d886d38daa8ac0be8ddb85d645fb0bc to your computer and use it in GitHub Desktop.
Python 3: pdfminer code to convert pdf to text, html or xml
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Requires pdfminer.six on Python 3: `pip3 install pdfminer.six`
from typing import Container | |
from io import BytesIO | |
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter | |
from pdfminer.converter import TextConverter, XMLConverter, HTMLConverter | |
from pdfminer.layout import LAParams | |
from pdfminer.pdfpage import PDFPage | |
def convert_pdf(
    path: str,
    format: str = "text",
    codec: str = "utf-8",
    password: str = "",
    maxpages: int = 0,
    caching: bool = True,
    pagenos: Container[int] = frozenset(),
) -> str:
    """Convert a PDF file to plain text, HTML or XML using pdfminer.

    Parameters
    ----------
    path : str
        Path to the pdf file.
    format : str, optional
        Format of output, must be one of: "text", "html", "xml".
        By default, "text" format is used.
    codec : str, optional
        Encoding handed to the pdfminer converter and used to decode the
        converted bytes back into ``str``. By default "utf-8" is used.
    password : str, optional
        Password forwarded to ``PDFPage.get_pages``.
    maxpages : int, optional
        Max number of pages to convert. By default is 0, i.e. reads all pages.
    caching : bool, optional
        Caching flag forwarded to ``PDFPage.get_pages``. By default is True.
    pagenos : Container[int], optional
        Numbers of the pages to convert. An empty container (the default)
        applies no page filter.

    Returns
    -------
    str
        Converted pdf file.

    Raises
    ------
    ValueError
        If ``format`` is not one of "text", "html" or "xml".
    """
    # Reject an unknown format up front, before any file or converter
    # resources are allocated.
    if format not in ("text", "html", "xml"):
        raise ValueError("provide format, either text, html or xml!")

    if format == "text":
        converter_cls = TextConverter
    elif format == "html":
        converter_cls = HTMLConverter
    else:
        converter_cls = XMLConverter

    rsrcmgr = PDFResourceManager()
    with BytesIO() as retstr:
        device = converter_cls(
            rsrcmgr, retstr, codec=codec, laparams=LAParams()
        )
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path, "rb") as fp:
                for page in PDFPage.get_pages(
                    fp,
                    pagenos,
                    maxpages=maxpages,
                    password=password,
                    caching=caching,
                    check_extractable=True,
                ):
                    interpreter.process_page(page)
        finally:
            # The device must be closed BEFORE the buffer is read: the
            # HTML/XML converters write their closing markup from close(),
            # so reading first would return a truncated document.
            device.close()
        # Decode with the same codec the converter wrote with, rather than
        # silently assuming UTF-8.
        return retstr.getvalue().decode(codec)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks!